From a98e716892ff1ac9060a24eba0957ffaeea8e82f Mon Sep 17 00:00:00 2001 From: Javier Chavarri Date: Mon, 11 Dec 2023 18:38:56 +0000 Subject: [PATCH 1/8] inline Netencoding.Url into Web --- devkit.opam | 1 - dune | 5 +- httpev.ml | 2 +- web.ml | 504 +++++++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 503 insertions(+), 9 deletions(-) diff --git a/devkit.opam b/devkit.opam index 95daa62..dfe9978 100644 --- a/devkit.opam +++ b/devkit.opam @@ -18,7 +18,6 @@ depends: [ "camlzip" "libevent" {>= "0.8.0"} "ocurl" {>= "0.7.2"} - "ocamlnet" "pcre" "extunix" {>= "0.1.4"} "lwt" {>= "2.5.2"} diff --git a/dune b/dune index 6737877..43877a8 100644 --- a/dune +++ b/dune @@ -16,9 +16,10 @@ libevent lwt lwt.unix - netstring pcre stdlib-shims + str + unix yojson zip) (modules :standard \ @@ -59,7 +60,7 @@ (executable (name test) - (libraries devkit extlib extunix libevent netstring ounit2 yojson) + (libraries devkit extlib extunix libevent ounit2 yojson) (modules test test_httpev)) (rule diff --git a/httpev.ml b/httpev.ml index 0fc6db4..ed668a9 100644 --- a/httpev.ml +++ b/httpev.ml @@ -184,7 +184,7 @@ let get_content_length headers = | Some s -> try Some (int_of_string s) with _ -> failed Header (sprintf "content-length %S" s) let decode_args s = - try Netencoding.Url.dest_url_encoded_parameters s with exn -> Exn.fail ~exn "decode_args : %S" s + try Web.Url.dest_url_encoded_parameters s with exn -> Exn.fail ~exn "decode_args : %S" s let acceptable_encoding headers = let split s c = List.map (String.strip ~chars:" \t\r\n") @@ Stre.nsplitc s c in diff --git a/web.ml b/web.ml index 8e94d02..65c702e 100644 --- a/web.ml +++ b/web.ml @@ -2,23 +2,517 @@ open ExtLib open Printf - open Prelude open Control let log = Log.self +module Url : sig + (** Encoding/Decoding within URLs: + * + * The following two functions perform the '%'-substitution for + * characters that may otherwise be interpreted as metacharacters. + * + * According to: RFC 1738, RFC 1630 + * + * Option [plus]: This option has been added because there are some + * implementations that do not map ' ' to '+', for example Javascript's + * [escape] function. The default is [true] because this is the RFC- + * compliant definition. + *) + + (** There are no tstring and polymorphic versions of the encode and + decode functions, as URLs are comparatively short, and it is + considered as acceptable for the user to convert types as needed, + even if strings need to be copied for that. + *) + + val decode : ?plus:bool -> ?pos:int -> ?len:int -> string -> string + (** Option [plus]: Whether '+' is converted to space. The default + * is true. If false, '+' is returned as it is. + * + * The optional arguments [pos] and [len] may restrict the string + * to process to this substring. + *) + + val encode : ?plus:bool -> string -> string + (** Option [plus]: Whether spaces are converted to '+'. The default + * is true. If false, spaces are converted to "%20", and + * only %xx sequences are produced. + *) + + val dest_url_encoded_parameters : string -> (string * string) list + (** The argument is the URL-encoded parameter string. The result is + * the corresponding list of (name,value) pairs. + * Note: Whitespace within the parameter string is ignored. + * If there is a format error, the function fails. + *) +end = struct + module Netstring_str : sig + type regexp + (** The type of regular expressions *) + + type split_result = Str.split_result = + | Text of string + | Delim of string (** Here we keep compatibility with [Str] *) + + type result + (** The type of matching results *) + + val regexp : string -> regexp + (** Parses a regexp *) + + val matched_string : result -> string -> string + (** Extracts the matched part from the string. The string argument + * must be the same string passed to [string_match] or the search + * functions, and the result argument must be the corresponding + * result. + *) + + val match_beginning : result -> int + (** Returns the position where the matched part begins *) + + val full_split : regexp -> string -> split_result list + (** Like [split_delim], but returns the delimiters in the result *) + + val global_substitute : + regexp -> (result -> string -> string) -> string -> string + (** [global_substitute re subst s]: Applies the substitution function + * [subst] to all matchings of [re] in [s], and returns the + * transformed string. [subst] is called with the current [result] + * of the match and the whole string [s]. + *) + end = struct + type setatom = Schar of char | Srange of (char * char) + and set = setatom list + + type re_term = + | Texact of string (* literal characters (except NUL) *) + | Tnullchar (* NUL characer *) + | Tany (* . but no newline *) + | Tnull (* emptiness *) + | Tconcat of re_term list + | Tstar of re_term (* x* *) + | Tplus of re_term (* x+ *) + | Toption of re_term (* x? *) + | Tset of set (* [...] *) + | Tnegset of set (* [^...] *) + | Tbegline (* ^ *) + | Tendline (* $ *) + | Talt of re_term list (* x\|y *) + | Tgroup of (int * re_term) (* \(...\) *) + | Trefer of int (* \i *) + | Twordbound (* \b *) + + (**********************************************************************) + (* Final types *) + + type regexp = Pcre.regexp + type split_result = Str.split_result = Text of string | Delim of string + type result = Pcre.substrings + + (**********************************************************************) + (* Parse Str-style regexps, and convert to Pcre-style regexps *) + + let scan_str_regexp re_string = + let l = String.length re_string in + let k = ref (-1) in + let c = ref ' ' in + let esc = ref false in + let group = ref 1 in + let n_open_groups = ref 0 in + let closed_groups = Array.create 10 false in + + let next () = + incr k; + if !k < l then + let c1 = re_string.[!k] in + if c1 = '\\' then + if !k < l then ( + incr k; + c := re_string.[!k]; + esc := true) + else failwith "Web.Url.Netstring_str regexp: bad backslash" + else ( + esc := false; + c := c1) + in + + let next_noesc () = + incr k; + if !k < l then ( + c := re_string.[!k]; + esc := false) + in + + let rec scan_alternative () = + let t1 = scan_concatenation () in + if !k < l then + if !esc && !c = '|' then ( + next (); + match scan_alternative () with + | Talt alist -> Talt (t1 :: alist) + | t -> Talt [ t1; t ]) + else t1 + else t1 + and scan_concatenation () = + let t1 = scan_repetition () in + if t1 = Tnull then t1 + else + let t2 = scan_concatenation () in + match t2 with + | Tnull -> t1 + | Texact s2 -> ( + match t1 with + | Texact s1 -> Texact (s1 ^ s2) + | _ -> Tconcat [ t1; t2 ]) + | Tconcat clist -> Tconcat (t1 :: clist) + | _ -> Tconcat [ t1; t2 ] + and scan_repetition () = + let t1 = ref (scan_literal_or_group ()) in + let continue = ref true in + while !continue do + if !k < l && not !esc then + match !c with + | '*' -> + next (); + t1 := Tstar !t1 + | '+' -> + next (); + t1 := Tplus !t1 + | '?' -> + next (); + t1 := Toption !t1 + (* {...} is not implemented in Str *) + | _ -> continue := false + else continue := false + done; + !t1 + and scan_literal_or_group () = + if !k >= l then Tnull + else if !esc then ( + match !c with + | '(' -> + next (); + let n = !group in + incr group; + incr n_open_groups; + let t = scan_alternative () in + decr n_open_groups; + if !k < l && !esc && !c = ')' then ( + next (); + closed_groups.(n) <- true; + Tgroup (n, t)) + else failwith "regexp: closing paranthesis \\) not found" + | '1' .. '9' -> + let n = Char.code !c - Char.code '0' in + if closed_groups.(n) then ( + next (); + Trefer n) + else failwith "regexp: bad reference to group" + (* + | 'w' -> next(); Twordchar + | 'W' -> next(); Tnowordchar + *) + | 'b' -> + next (); + Twordbound + (* + | 'B' -> next(); Tnowordbound + | '<' -> next(); Twordbeg + | '>' -> next(); Twordend + | '`' -> next(); Tbegbuf + | '\'' -> next(); Tendbuf + *) + | '\\' -> + next (); + Texact (String.make 1 '\\') + | '|' -> Tnull + | ')' -> + if !n_open_groups > 0 then Tnull + else failwith "regexp: unmatched closing parenthesis" + | ch -> + next (); + Texact (String.make 1 ch)) + else + match !c with + | '*' -> Tnull + | '+' -> Tnull + | '?' -> Tnull + | '{' -> Tnull + | '^' -> + next (); + Tbegline + | '$' -> + next (); + Tendline + | '.' -> + next (); + Tany + | '\000' -> + next (); + Tnullchar + | '[' -> + next_noesc (); + if !k < l then ( + let negated = ref false in + let set = ref [] in + + let add_char c = set := Schar c :: !set in + + let add_range c1 c2 = set := Srange (c1, c2) :: !set in + + if !c = '^' then ( + next_noesc (); + negated := true); + + let continue = ref true in + let first = ref true in + + (* the character after [ or [^ ? *) + while !continue && !k < l do + match () with + | () when !c = '[' && !k + 1 < l && re_string.[!k + 1] = ':' + -> + failwith + "regexp: Character classes such as [[:digit:]] not \ + implemented" + (* TODO: check for predefined sets *) + | () when !c = ']' && not !first -> + next (); + continue := false + | () + when !k + 2 < l + && re_string.[!k + 1] = '-' + && re_string.[!k + 2] <> ']' -> + (* range *) + add_range !c re_string.[!k + 2]; + next_noesc (); + next_noesc (); + next_noesc (); + first := false + | () -> + add_char !c; + next_noesc (); + first := false + done; + + if !continue then failwith "regexp: closing bracket ] not found"; + + if !negated then Tnegset !set else Tset !set) + else failwith "regexp: closing bracket ] not found" + | ch -> + next (); + Texact (String.make 1 ch) + in + + try + next (); + scan_alternative () + with Failure msg -> failwith (msg ^ " - regexp: " ^ re_string) + + let pcre_safe_quote c = + (* for print_set *) + match c with + | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' -> String.make 1 c + | '\000' -> "\\000" + | _ -> "\\" ^ String.make 1 c + + let rec print_pcre_regexp ret = + match ret with + | Texact s -> Pcre.quote s + | Tnullchar -> + (* Pcre.quote "\000" returns nonsense *) + "[\\000]" + | Tany -> "." + | Tnull -> "(?:)" + | Tconcat l -> String.concat "" (List.map print_pcre_regexp l) + | Tstar ret' -> print_pcre_subregexp ret' ^ "*" + | Tplus ret' -> print_pcre_subregexp ret' ^ "+" + | Toption ret' -> print_pcre_subregexp ret' ^ "?" + | Tset s -> "[" ^ print_set s ^ "]" + | Tnegset s -> "[^" ^ print_set s ^ "]" + | Talt l -> String.concat "|" (List.map print_pcre_subregexp l) + | Tgroup (_, ret') -> "(" ^ print_pcre_regexp ret' ^ ")" + | Trefer n -> + (* Put parentheses around \n to disambiguate from \nn *) + "(?:\\" ^ string_of_int n ^ ")" + | Tbegline -> "^" + | Tendline -> "(?:$)" + | Twordbound -> "\\b" + + and print_pcre_subregexp ret = + (* Print ret, but put parentheses around ret *) + match ret with + | Tset _ | Tnegset _ | Tgroup (_, _) -> + (* No additional parentheses needed *) + print_pcre_regexp ret + | _ -> + (* Print (?:ret). This is the "neutral" form of grouping that only + * changes precedence + *) + "(?:" ^ print_pcre_regexp ret ^ ")" + + and print_set s = + String.concat "" + (List.map + (function + | Schar c -> pcre_safe_quote c + | Srange (c1, c2) -> pcre_safe_quote c1 ^ "-" ^ pcre_safe_quote c2) + s) + + (**********************************************************************) + (* Emulation *) + + let regexp s = + let ret = scan_str_regexp s in + let s' = print_pcre_regexp ret in + Pcre.regexp ~flags:[ `MULTILINE ] s' + + let matched_string result _ = + (* Unfortunately, Pcre.get_substring will not raise Not_found if there is + * no matched string. Instead, it returns "", but this value cannot be + * distinguished from an empty match. + * The workaround is to call Pcre.get_substring_ofs first. This function + * will raise Not_found if there is not any matched string. + * + * NOTE: Current versions of Pcre do return Not_found! + *) + ignore (Pcre.get_substring_ofs result 0); + Pcre.get_substring result 0 + + let match_beginning result = fst (Pcre.get_substring_ofs result 0) + + let global_substitute pat subst s = + Pcre.substitute_substrings ~rex:pat ~subst:(fun r -> subst r s) s + + let tr_split_result r = + List.map + (function + | Pcre.Text t -> Text t | Pcre.Delim d -> Delim d | _ -> assert false) + (List.filter + (function Pcre.Group (_, _) | Pcre.NoGroup -> false | _ -> true) + r) + + let full_split sep s = + tr_split_result (Pcre.full_split ~rex:sep ~max:(-1) s) + + end + + (* copied from https://gitlab.com/gerdstolpmann/lib-ocamlnet3/-/blob/4d1a8401bd40c17632128545e2aa4c880535e208/code/src/netstring/netencoding.ml#L993 *) + let hex_digits = + [| + '0'; + '1'; + '2'; + '3'; + '4'; + '5'; + '6'; + '7'; + '8'; + '9'; + 'A'; + 'B'; + 'C'; + 'D'; + 'E'; + 'F'; + |] + + let to_hex2 k = + (* Converts k to a 2-digit hex string *) + let s = Bytes.create 2 in + Bytes.set s 0 hex_digits.((k lsr 4) land 15); + Bytes.set s 1 hex_digits.(k land 15); + Bytes.unsafe_to_string s + + let of_hex1 c = + match c with + | '0' .. '9' -> Char.code c - Char.code '0' + | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10 + | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10 + | _ -> raise Not_found + + let url_encoding_re = Netstring_str.regexp "[^A-Za-z0-9_.!*-]" + let url_decoding_re = Netstring_str.regexp "\\+\\|%..\\|%.\\|%" + + let encode ?(plus = true) s = + Netstring_str.global_substitute url_encoding_re + (fun r _ -> + match Netstring_str.matched_string r s with + | " " when plus -> "+" + | x -> + let k = Char.code x.[0] in + "%" ^ to_hex2 k) + s + + let decode ?(plus = true) ?(pos = 0) ?len s = + let s_l = String.length s in + let s1 = + if pos = 0 && len = None then s + else + let len = match len with Some n -> n | None -> s_l in + String.sub s pos len + in + let l = String.length s1 in + Netstring_str.global_substitute url_decoding_re + (fun r _ -> + match Netstring_str.matched_string r s1 with + | "+" -> if plus then " " else "+" + | _ -> ( + let i = Netstring_str.match_beginning r in + (* Assertion: s1.[i] = '%' *) + if i + 2 >= l then failwith "Web.Url.decode"; + let c1 = s1.[i + 1] in + let c2 = s1.[i + 2] in + try + let k1 = of_hex1 c1 in + let k2 = of_hex1 c2 in + String.make 1 (Char.chr ((k1 lsl 4) lor k2)) + with Not_found -> failwith "Web.Url.decode")) + s1 + + let url_split_re = Netstring_str.regexp "[&=]" + + let dest_url_encoded_parameters parstr = + let rec parse_after_amp tl = + match tl with + | Netstring_str.Text name + :: Netstring_str.Delim "=" + :: Netstring_str.Text value + :: tl' -> + (decode name, decode value) :: parse_next tl' + | Netstring_str.Text name + :: Netstring_str.Delim "=" + :: Netstring_str.Delim "&" + :: tl' -> + (decode name, "") :: parse_after_amp tl' + | [ Netstring_str.Text name; Netstring_str.Delim "=" ] -> + [ (decode name, "") ] + | _ -> failwith "Web.Url.dest_url_encoded_parameters" + and parse_next tl = + match tl with + | [] -> [] + | Netstring_str.Delim "&" :: tl' -> parse_after_amp tl' + | _ -> failwith "Web.Url.dest_url_encoded_parameters" + in + let toklist = Netstring_str.full_split url_split_re parstr in + match toklist with [] -> [] | _ -> parse_after_amp toklist + +end + (** percent-encode (convert space into %20) *) -let rawurlencode = Netencoding.Url.encode ~plus:false +let rawurlencode = Url.encode ~plus:false (** percent-encode, but convert space into plus, not %20 *) -let urlencode = Netencoding.Url.encode ~plus:true +let urlencode = Url.encode ~plus:true (** percent-decode (leave plus as is) *) -let rawurldecode s = try Netencoding.Url.decode ~plus:false s with _ -> s +let rawurldecode s = try Url.decode ~plus:false s with _ -> s (** percent-decode and convert plus into space *) -let urldecode s = try Netencoding.Url.decode ~plus:true s with _ -> s +let urldecode s = try Url.decode ~plus:true s with _ -> s let htmlencode = Netencoding.Html.encode ~in_enc:`Enc_utf8 ~out_enc:`Enc_utf8 () let htmldecode_exn = Netencoding.Html.decode ~in_enc:`Enc_utf8 ~out_enc:`Enc_utf8 () From f482555bfc030c46c7975e88a8109f3f877a125d Mon Sep 17 00:00:00 2001 From: Javier Chavarri Date: Tue, 12 Dec 2023 09:27:56 +0000 Subject: [PATCH 2/8] move ocamlnet to its own lib --- dune | 1 + ocamlnet_lite/dune | 7 + ocamlnet_lite/netstring_str.ml | 316 ++++++++++++++++++++ ocamlnet_lite/netstring_str.mli | 33 +++ ocamlnet_lite/url.ml | 100 +++++++ ocamlnet_lite/url.mli | 39 +++ web.ml | 496 +------------------------------- 7 files changed, 497 insertions(+), 495 deletions(-) create mode 100644 ocamlnet_lite/dune create mode 100644 ocamlnet_lite/netstring_str.ml create mode 100644 ocamlnet_lite/netstring_str.mli create mode 100644 ocamlnet_lite/url.ml create mode 100644 ocamlnet_lite/url.mli diff --git a/dune b/dune index 43877a8..27d8bc1 100644 --- a/dune +++ b/dune @@ -16,6 +16,7 @@ libevent lwt lwt.unix + ocamlnet_lite pcre stdlib-shims str diff --git a/ocamlnet_lite/dune b/ocamlnet_lite/dune new file mode 100644 index 0000000..e32763d --- /dev/null +++ b/ocamlnet_lite/dune @@ -0,0 +1,7 @@ +(library + (name ocamlnet_lite) + (public_name devkit.ocamlnet_lite) + (libraries + extlib ; just for Array.create + pcre + str)) diff --git a/ocamlnet_lite/netstring_str.ml b/ocamlnet_lite/netstring_str.ml new file mode 100644 index 0000000..9449fe5 --- /dev/null +++ b/ocamlnet_lite/netstring_str.ml @@ -0,0 +1,316 @@ +open ExtLib + +type setatom = Schar of char | Srange of (char * char) +and set = setatom list + +type re_term = + | Texact of string (* literal characters (except NUL) *) + | Tnullchar (* NUL characer *) + | Tany (* . but no newline *) + | Tnull (* emptiness *) + | Tconcat of re_term list + | Tstar of re_term (* x* *) + | Tplus of re_term (* x+ *) + | Toption of re_term (* x? *) + | Tset of set (* [...] *) + | Tnegset of set (* [^...] *) + | Tbegline (* ^ *) + | Tendline (* $ *) + | Talt of re_term list (* x\|y *) + | Tgroup of (int * re_term) (* \(...\) *) + | Trefer of int (* \i *) + | Twordbound (* \b *) + +(**********************************************************************) +(* Final types *) + +type regexp = Pcre.regexp +type split_result = Str.split_result = Text of string | Delim of string +type result = Pcre.substrings + +(**********************************************************************) +(* Parse Str-style regexps, and convert to Pcre-style regexps *) + +let scan_str_regexp re_string = + let l = String.length re_string in + let k = ref (-1) in + let c = ref ' ' in + let esc = ref false in + let group = ref 1 in + let n_open_groups = ref 0 in + let closed_groups = Array.create 10 false in + + let next () = + incr k; + if !k < l then + let c1 = re_string.[!k] in + if c1 = '\\' then + if !k < l then ( + incr k; + c := re_string.[!k]; + esc := true) + else failwith "Web.Url.Netstring_str regexp: bad backslash" + else ( + esc := false; + c := c1) + in + + let next_noesc () = + incr k; + if !k < l then ( + c := re_string.[!k]; + esc := false) + in + + let rec scan_alternative () = + let t1 = scan_concatenation () in + if !k < l then + if !esc && !c = '|' then ( + next (); + match scan_alternative () with + | Talt alist -> Talt (t1 :: alist) + | t -> Talt [ t1; t ]) + else t1 + else t1 + and scan_concatenation () = + let t1 = scan_repetition () in + if t1 = Tnull then t1 + else + let t2 = scan_concatenation () in + match t2 with + | Tnull -> t1 + | Texact s2 -> ( + match t1 with + | Texact s1 -> Texact (s1 ^ s2) + | _ -> Tconcat [ t1; t2 ]) + | Tconcat clist -> Tconcat (t1 :: clist) + | _ -> Tconcat [ t1; t2 ] + and scan_repetition () = + let t1 = ref (scan_literal_or_group ()) in + let continue = ref true in + while !continue do + if !k < l && not !esc then + match !c with + | '*' -> + next (); + t1 := Tstar !t1 + | '+' -> + next (); + t1 := Tplus !t1 + | '?' -> + next (); + t1 := Toption !t1 + (* {...} is not implemented in Str *) + | _ -> continue := false + else continue := false + done; + !t1 + and scan_literal_or_group () = + if !k >= l then Tnull + else if !esc then ( + match !c with + | '(' -> + next (); + let n = !group in + incr group; + incr n_open_groups; + let t = scan_alternative () in + decr n_open_groups; + if !k < l && !esc && !c = ')' then ( + next (); + closed_groups.(n) <- true; + Tgroup (n, t)) + else failwith "regexp: closing paranthesis \\) not found" + | '1' .. '9' -> + let n = Char.code !c - Char.code '0' in + if closed_groups.(n) then ( + next (); + Trefer n) + else failwith "regexp: bad reference to group" + (* + | 'w' -> next(); Twordchar + | 'W' -> next(); Tnowordchar + *) + | 'b' -> + next (); + Twordbound + (* + | 'B' -> next(); Tnowordbound + | '<' -> next(); Twordbeg + | '>' -> next(); Twordend + | '`' -> next(); Tbegbuf + | '\'' -> next(); Tendbuf + *) + | '\\' -> + next (); + Texact (String.make 1 '\\') + | '|' -> Tnull + | ')' -> + if !n_open_groups > 0 then Tnull + else failwith "regexp: unmatched closing parenthesis" + | ch -> + next (); + Texact (String.make 1 ch)) + else + match !c with + | '*' -> Tnull + | '+' -> Tnull + | '?' -> Tnull + | '{' -> Tnull + | '^' -> + next (); + Tbegline + | '$' -> + next (); + Tendline + | '.' -> + next (); + Tany + | '\000' -> + next (); + Tnullchar + | '[' -> + next_noesc (); + if !k < l then ( + let negated = ref false in + let set = ref [] in + + let add_char c = set := Schar c :: !set in + + let add_range c1 c2 = set := Srange (c1, c2) :: !set in + + if !c = '^' then ( + next_noesc (); + negated := true); + + let continue = ref true in + let first = ref true in + + (* the character after [ or [^ ? *) + while !continue && !k < l do + match () with + | () when !c = '[' && !k + 1 < l && re_string.[!k + 1] = ':' + -> + failwith + "regexp: Character classes such as [[:digit:]] not \ + implemented" + (* TODO: check for predefined sets *) + | () when !c = ']' && not !first -> + next (); + continue := false + | () + when !k + 2 < l + && re_string.[!k + 1] = '-' + && re_string.[!k + 2] <> ']' -> + (* range *) + add_range !c re_string.[!k + 2]; + next_noesc (); + next_noesc (); + next_noesc (); + first := false + | () -> + add_char !c; + next_noesc (); + first := false + done; + + if !continue then failwith "regexp: closing bracket ] not found"; + + if !negated then Tnegset !set else Tset !set) + else failwith "regexp: closing bracket ] not found" + | ch -> + next (); + Texact (String.make 1 ch) + in + + try + next (); + scan_alternative () + with Failure msg -> failwith (msg ^ " - regexp: " ^ re_string) + +let pcre_safe_quote c = + (* for print_set *) + match c with + | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' -> String.make 1 c + | '\000' -> "\\000" + | _ -> "\\" ^ String.make 1 c + +let rec print_pcre_regexp ret = + match ret with + | Texact s -> Pcre.quote s + | Tnullchar -> + (* Pcre.quote "\000" returns nonsense *) + "[\\000]" + | Tany -> "." + | Tnull -> "(?:)" + | Tconcat l -> String.concat "" (List.map print_pcre_regexp l) + | Tstar ret' -> print_pcre_subregexp ret' ^ "*" + | Tplus ret' -> print_pcre_subregexp ret' ^ "+" + | Toption ret' -> print_pcre_subregexp ret' ^ "?" + | Tset s -> "[" ^ print_set s ^ "]" + | Tnegset s -> "[^" ^ print_set s ^ "]" + | Talt l -> String.concat "|" (List.map print_pcre_subregexp l) + | Tgroup (_, ret') -> "(" ^ print_pcre_regexp ret' ^ ")" + | Trefer n -> + (* Put parentheses around \n to disambiguate from \nn *) + "(?:\\" ^ string_of_int n ^ ")" + | Tbegline -> "^" + | Tendline -> "(?:$)" + | Twordbound -> "\\b" + +and print_pcre_subregexp ret = + (* Print ret, but put parentheses around ret *) + match ret with + | Tset _ | Tnegset _ | Tgroup (_, _) -> + (* No additional parentheses needed *) + print_pcre_regexp ret + | _ -> + (* Print (?:ret). This is the "neutral" form of grouping that only + * changes precedence + *) + "(?:" ^ print_pcre_regexp ret ^ ")" + +and print_set s = + String.concat "" + (List.map + (function + | Schar c -> pcre_safe_quote c + | Srange (c1, c2) -> pcre_safe_quote c1 ^ "-" ^ pcre_safe_quote c2) + s) + +(**********************************************************************) +(* Emulation *) + +let regexp s = + let ret = scan_str_regexp s in + let s' = print_pcre_regexp ret in + Pcre.regexp ~flags:[ `MULTILINE ] s' + +let matched_string result _ = + (* Unfortunately, Pcre.get_substring will not raise Not_found if there is + * no matched string. Instead, it returns "", but this value cannot be + * distinguished from an empty match. + * The workaround is to call Pcre.get_substring_ofs first. This function + * will raise Not_found if there is not any matched string. + * + * NOTE: Current versions of Pcre do return Not_found! + *) + ignore (Pcre.get_substring_ofs result 0); + Pcre.get_substring result 0 + +let match_beginning result = fst (Pcre.get_substring_ofs result 0) + +let global_substitute pat subst s = + Pcre.substitute_substrings ~rex:pat ~subst:(fun r -> subst r s) s + +let tr_split_result r = + List.map + (function + | Pcre.Text t -> Text t | Pcre.Delim d -> Delim d | _ -> assert false) + (List.filter + (function Pcre.Group (_, _) | Pcre.NoGroup -> false | _ -> true) + r) + +let full_split sep s = + tr_split_result (Pcre.full_split ~rex:sep ~max:(-1) s) + diff --git a/ocamlnet_lite/netstring_str.mli b/ocamlnet_lite/netstring_str.mli new file mode 100644 index 0000000..e1eae0d --- /dev/null +++ b/ocamlnet_lite/netstring_str.mli @@ -0,0 +1,33 @@ +type regexp +(** The type of regular expressions *) + +type split_result = Str.split_result = + | Text of string + | Delim of string (** Here we keep compatibility with [Str] *) + +type result +(** The type of matching results *) + +val regexp : string -> regexp +(** Parses a regexp *) + +val matched_string : result -> string -> string +(** Extracts the matched part from the string. The string argument + * must be the same string passed to [string_match] or the search + * functions, and the result argument must be the corresponding + * result. + *) + +val match_beginning : result -> int +(** Returns the position where the matched part begins *) + +val full_split : regexp -> string -> split_result list +(** Like [split_delim], but returns the delimiters in the result *) + +val global_substitute : + regexp -> (result -> string -> string) -> string -> string +(** [global_substitute re subst s]: Applies the substitution function + * [subst] to all matchings of [re] in [s], and returns the + * transformed string. [subst] is called with the current [result] + * of the match and the whole string [s]. + *) diff --git a/ocamlnet_lite/url.ml b/ocamlnet_lite/url.ml new file mode 100644 index 0000000..aea3b81 --- /dev/null +++ b/ocamlnet_lite/url.ml @@ -0,0 +1,100 @@ +(* adapted from https://gitlab.com/gerdstolpmann/lib-ocamlnet3/-/blob/4d1a8401bd40c17632128545e2aa4c880535e208/code/src/netstring/netencoding.ml#L993 *) +let hex_digits = + [| + '0'; + '1'; + '2'; + '3'; + '4'; + '5'; + '6'; + '7'; + '8'; + '9'; + 'A'; + 'B'; + 'C'; + 'D'; + 'E'; + 'F'; + |] + +let to_hex2 k = + (* Converts k to a 2-digit hex string *) + let s = Bytes.create 2 in + Bytes.set s 0 hex_digits.((k lsr 4) land 15); + Bytes.set s 1 hex_digits.(k land 15); + Bytes.unsafe_to_string s + +let of_hex1 c = + match c with + | '0' .. '9' -> Char.code c - Char.code '0' + | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10 + | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10 + | _ -> raise Not_found + +let url_encoding_re = Netstring_str.regexp "[^A-Za-z0-9_.!*-]" +let url_decoding_re = Netstring_str.regexp "\\+\\|%..\\|%.\\|%" + +let encode ?(plus = true) s = + Netstring_str.global_substitute url_encoding_re + (fun r _ -> + match Netstring_str.matched_string r s with + | " " when plus -> "+" + | x -> + let k = Char.code x.[0] in + "%" ^ to_hex2 k) + s + +let decode ?(plus = true) ?(pos = 0) ?len s = + let s_l = String.length s in + let s1 = + if pos = 0 && len = None then s + else + let len = match len with Some n -> n | None -> s_l in + String.sub s pos len + in + let l = String.length s1 in + Netstring_str.global_substitute url_decoding_re + (fun r _ -> + match Netstring_str.matched_string r s1 with + | "+" -> if plus then " " else "+" + | _ -> ( + let i = Netstring_str.match_beginning r in + (* Assertion: s1.[i] = '%' *) + if i + 2 >= l then failwith "Web.Url.decode"; + let c1 = s1.[i + 1] in + let c2 = s1.[i + 2] in + try + let k1 = of_hex1 c1 in + let k2 = of_hex1 c2 in + String.make 1 (Char.chr ((k1 lsl 4) lor k2)) + with Not_found -> failwith "Web.Url.decode")) + s1 + +let url_split_re = Netstring_str.regexp "[&=]" + +let dest_url_encoded_parameters parstr = + let rec parse_after_amp tl = + match tl with + | Netstring_str.Text name + :: Netstring_str.Delim "=" + :: Netstring_str.Text value + :: tl' -> + (decode name, decode value) :: parse_next tl' + | Netstring_str.Text name + :: Netstring_str.Delim "=" + :: Netstring_str.Delim "&" + :: tl' -> + (decode name, "") :: parse_after_amp tl' + | [ Netstring_str.Text name; Netstring_str.Delim "=" ] -> + [ (decode name, "") ] + | _ -> failwith "Web.Url.dest_url_encoded_parameters" + and parse_next tl = + match tl with + | [] -> [] + | Netstring_str.Delim "&" :: tl' -> parse_after_amp tl' + | _ -> failwith "Web.Url.dest_url_encoded_parameters" + in + let toklist = Netstring_str.full_split url_split_re parstr in + match toklist with [] -> [] | _ -> parse_after_amp toklist diff --git a/ocamlnet_lite/url.mli b/ocamlnet_lite/url.mli new file mode 100644 index 0000000..980f914 --- /dev/null +++ b/ocamlnet_lite/url.mli @@ -0,0 +1,39 @@ +(** Encoding/Decoding within URLs: + * + * The following two functions perform the '%'-substitution for + * characters that may otherwise be interpreted as metacharacters. + * + * According to: RFC 1738, RFC 1630 + * + * Option [plus]: This option has been added because there are some + * implementations that do not map ' ' to '+', for example Javascript's + * [escape] function. The default is [true] because this is the RFC- + * compliant definition. + *) + +(** There are no tstring and polymorphic versions of the encode and + decode functions, as URLs are comparatively short, and it is + considered as acceptable for the user to convert types as needed, + even if strings need to be copied for that. +*) + +val decode : ?plus:bool -> ?pos:int -> ?len:int -> string -> string +(** Option [plus]: Whether '+' is converted to space. The default + * is true. If false, '+' is returned as it is. + * + * The optional arguments [pos] and [len] may restrict the string + * to process to this substring. + *) + +val encode : ?plus:bool -> string -> string +(** Option [plus]: Whether spaces are converted to '+'. The default + * is true. If false, spaces are converted to "%20", and + * only %xx sequences are produced. + *) + +val dest_url_encoded_parameters : string -> (string * string) list +(** The argument is the URL-encoded parameter string. The result is + * the corresponding list of (name,value) pairs. + * Note: Whitespace within the parameter string is ignored. + * If there is a format error, the function fails. + *) diff --git a/web.ml b/web.ml index 65c702e..21422e9 100644 --- a/web.ml +++ b/web.ml @@ -4,504 +4,10 @@ open ExtLib open Printf open Prelude open Control +open Ocamlnet_lite let log = Log.self -module Url : sig - (** Encoding/Decoding within URLs: - * - * The following two functions perform the '%'-substitution for - * characters that may otherwise be interpreted as metacharacters. - * - * According to: RFC 1738, RFC 1630 - * - * Option [plus]: This option has been added because there are some - * implementations that do not map ' ' to '+', for example Javascript's - * [escape] function. The default is [true] because this is the RFC- - * compliant definition. - *) - - (** There are no tstring and polymorphic versions of the encode and - decode functions, as URLs are comparatively short, and it is - considered as acceptable for the user to convert types as needed, - even if strings need to be copied for that. - *) - - val decode : ?plus:bool -> ?pos:int -> ?len:int -> string -> string - (** Option [plus]: Whether '+' is converted to space. The default - * is true. If false, '+' is returned as it is. - * - * The optional arguments [pos] and [len] may restrict the string - * to process to this substring. - *) - - val encode : ?plus:bool -> string -> string - (** Option [plus]: Whether spaces are converted to '+'. The default - * is true. If false, spaces are converted to "%20", and - * only %xx sequences are produced. - *) - - val dest_url_encoded_parameters : string -> (string * string) list - (** The argument is the URL-encoded parameter string. The result is - * the corresponding list of (name,value) pairs. - * Note: Whitespace within the parameter string is ignored. - * If there is a format error, the function fails. - *) -end = struct - module Netstring_str : sig - type regexp - (** The type of regular expressions *) - - type split_result = Str.split_result = - | Text of string - | Delim of string (** Here we keep compatibility with [Str] *) - - type result - (** The type of matching results *) - - val regexp : string -> regexp - (** Parses a regexp *) - - val matched_string : result -> string -> string - (** Extracts the matched part from the string. The string argument - * must be the same string passed to [string_match] or the search - * functions, and the result argument must be the corresponding - * result. - *) - - val match_beginning : result -> int - (** Returns the position where the matched part begins *) - - val full_split : regexp -> string -> split_result list - (** Like [split_delim], but returns the delimiters in the result *) - - val global_substitute : - regexp -> (result -> string -> string) -> string -> string - (** [global_substitute re subst s]: Applies the substitution function - * [subst] to all matchings of [re] in [s], and returns the - * transformed string. [subst] is called with the current [result] - * of the match and the whole string [s]. - *) - end = struct - type setatom = Schar of char | Srange of (char * char) - and set = setatom list - - type re_term = - | Texact of string (* literal characters (except NUL) *) - | Tnullchar (* NUL characer *) - | Tany (* . but no newline *) - | Tnull (* emptiness *) - | Tconcat of re_term list - | Tstar of re_term (* x* *) - | Tplus of re_term (* x+ *) - | Toption of re_term (* x? *) - | Tset of set (* [...] *) - | Tnegset of set (* [^...] *) - | Tbegline (* ^ *) - | Tendline (* $ *) - | Talt of re_term list (* x\|y *) - | Tgroup of (int * re_term) (* \(...\) *) - | Trefer of int (* \i *) - | Twordbound (* \b *) - - (**********************************************************************) - (* Final types *) - - type regexp = Pcre.regexp - type split_result = Str.split_result = Text of string | Delim of string - type result = Pcre.substrings - - (**********************************************************************) - (* Parse Str-style regexps, and convert to Pcre-style regexps *) - - let scan_str_regexp re_string = - let l = String.length re_string in - let k = ref (-1) in - let c = ref ' ' in - let esc = ref false in - let group = ref 1 in - let n_open_groups = ref 0 in - let closed_groups = Array.create 10 false in - - let next () = - incr k; - if !k < l then - let c1 = re_string.[!k] in - if c1 = '\\' then - if !k < l then ( - incr k; - c := re_string.[!k]; - esc := true) - else failwith "Web.Url.Netstring_str regexp: bad backslash" - else ( - esc := false; - c := c1) - in - - let next_noesc () = - incr k; - if !k < l then ( - c := re_string.[!k]; - esc := false) - in - - let rec scan_alternative () = - let t1 = scan_concatenation () in - if !k < l then - if !esc && !c = '|' then ( - next (); - match scan_alternative () with - | Talt alist -> Talt (t1 :: alist) - | t -> Talt [ t1; t ]) - else t1 - else t1 - and scan_concatenation () = - let t1 = scan_repetition () in - if t1 = Tnull then t1 - else - let t2 = scan_concatenation () in - match t2 with - | Tnull -> t1 - | Texact s2 -> ( - match t1 with - | Texact s1 -> Texact (s1 ^ s2) - | _ -> Tconcat [ t1; t2 ]) - | Tconcat clist -> Tconcat (t1 :: clist) - | _ -> Tconcat [ t1; t2 ] - and scan_repetition () = - let t1 = ref (scan_literal_or_group ()) in - let continue = ref true in - while !continue do - if !k < l && not !esc then - match !c with - | '*' -> - next (); - t1 := Tstar !t1 - | '+' -> - next (); - t1 := Tplus !t1 - | '?' -> - next (); - t1 := Toption !t1 - (* {...} is not implemented in Str *) - | _ -> continue := false - else continue := false - done; - !t1 - and scan_literal_or_group () = - if !k >= l then Tnull - else if !esc then ( - match !c with - | '(' -> - next (); - let n = !group in - incr group; - incr n_open_groups; - let t = scan_alternative () in - decr n_open_groups; - if !k < l && !esc && !c = ')' then ( - next (); - closed_groups.(n) <- true; - Tgroup (n, t)) - else failwith "regexp: closing paranthesis \\) not found" - | '1' .. '9' -> - let n = Char.code !c - Char.code '0' in - if closed_groups.(n) then ( - next (); - Trefer n) - else failwith "regexp: bad reference to group" - (* - | 'w' -> next(); Twordchar - | 'W' -> next(); Tnowordchar - *) - | 'b' -> - next (); - Twordbound - (* - | 'B' -> next(); Tnowordbound - | '<' -> next(); Twordbeg - | '>' -> next(); Twordend - | '`' -> next(); Tbegbuf - | '\'' -> next(); Tendbuf - *) - | '\\' -> - next (); - Texact (String.make 1 '\\') - | '|' -> Tnull - | ')' -> - if !n_open_groups > 0 then Tnull - else failwith "regexp: unmatched closing parenthesis" - | ch -> - next (); - Texact (String.make 1 ch)) - else - match !c with - | '*' -> Tnull - | '+' -> Tnull - | '?' -> Tnull - | '{' -> Tnull - | '^' -> - next (); - Tbegline - | '$' -> - next (); - Tendline - | '.' -> - next (); - Tany - | '\000' -> - next (); - Tnullchar - | '[' -> - next_noesc (); - if !k < l then ( - let negated = ref false in - let set = ref [] in - - let add_char c = set := Schar c :: !set in - - let add_range c1 c2 = set := Srange (c1, c2) :: !set in - - if !c = '^' then ( - next_noesc (); - negated := true); - - let continue = ref true in - let first = ref true in - - (* the character after [ or [^ ? *) - while !continue && !k < l do - match () with - | () when !c = '[' && !k + 1 < l && re_string.[!k + 1] = ':' - -> - failwith - "regexp: Character classes such as [[:digit:]] not \ - implemented" - (* TODO: check for predefined sets *) - | () when !c = ']' && not !first -> - next (); - continue := false - | () - when !k + 2 < l - && re_string.[!k + 1] = '-' - && re_string.[!k + 2] <> ']' -> - (* range *) - add_range !c re_string.[!k + 2]; - next_noesc (); - next_noesc (); - next_noesc (); - first := false - | () -> - add_char !c; - next_noesc (); - first := false - done; - - if !continue then failwith "regexp: closing bracket ] not found"; - - if !negated then Tnegset !set else Tset !set) - else failwith "regexp: closing bracket ] not found" - | ch -> - next (); - Texact (String.make 1 ch) - in - - try - next (); - scan_alternative () - with Failure msg -> failwith (msg ^ " - regexp: " ^ re_string) - - let pcre_safe_quote c = - (* for print_set *) - match c with - | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' -> String.make 1 c - | '\000' -> "\\000" - | _ -> "\\" ^ String.make 1 c - - let rec print_pcre_regexp ret = - match ret with - | Texact s -> Pcre.quote s - | Tnullchar -> - (* Pcre.quote "\000" returns nonsense *) - "[\\000]" - | Tany -> "." - | Tnull -> "(?:)" - | Tconcat l -> String.concat "" (List.map print_pcre_regexp l) - | Tstar ret' -> print_pcre_subregexp ret' ^ "*" - | Tplus ret' -> print_pcre_subregexp ret' ^ "+" - | Toption ret' -> print_pcre_subregexp ret' ^ "?" - | Tset s -> "[" ^ print_set s ^ "]" - | Tnegset s -> "[^" ^ print_set s ^ "]" - | Talt l -> String.concat "|" (List.map print_pcre_subregexp l) - | Tgroup (_, ret') -> "(" ^ print_pcre_regexp ret' ^ ")" - | Trefer n -> - (* Put parentheses around \n to disambiguate from \nn *) - "(?:\\" ^ string_of_int n ^ ")" - | Tbegline -> "^" - | Tendline -> "(?:$)" - | Twordbound -> "\\b" - - and print_pcre_subregexp ret = - (* Print ret, but put parentheses around ret *) - match ret with - | Tset _ | Tnegset _ | Tgroup (_, _) -> - (* No additional parentheses needed *) - print_pcre_regexp ret - | _ -> - (* Print (?:ret). This is the "neutral" form of grouping that only - * changes precedence - *) - "(?:" ^ print_pcre_regexp ret ^ ")" - - and print_set s = - String.concat "" - (List.map - (function - | Schar c -> pcre_safe_quote c - | Srange (c1, c2) -> pcre_safe_quote c1 ^ "-" ^ pcre_safe_quote c2) - s) - - (**********************************************************************) - (* Emulation *) - - let regexp s = - let ret = scan_str_regexp s in - let s' = print_pcre_regexp ret in - Pcre.regexp ~flags:[ `MULTILINE ] s' - - let matched_string result _ = - (* Unfortunately, Pcre.get_substring will not raise Not_found if there is - * no matched string. Instead, it returns "", but this value cannot be - * distinguished from an empty match. - * The workaround is to call Pcre.get_substring_ofs first. This function - * will raise Not_found if there is not any matched string. - * - * NOTE: Current versions of Pcre do return Not_found! - *) - ignore (Pcre.get_substring_ofs result 0); - Pcre.get_substring result 0 - - let match_beginning result = fst (Pcre.get_substring_ofs result 0) - - let global_substitute pat subst s = - Pcre.substitute_substrings ~rex:pat ~subst:(fun r -> subst r s) s - - let tr_split_result r = - List.map - (function - | Pcre.Text t -> Text t | Pcre.Delim d -> Delim d | _ -> assert false) - (List.filter - (function Pcre.Group (_, _) | Pcre.NoGroup -> false | _ -> true) - r) - - let full_split sep s = - tr_split_result (Pcre.full_split ~rex:sep ~max:(-1) s) - - end - - (* copied from https://gitlab.com/gerdstolpmann/lib-ocamlnet3/-/blob/4d1a8401bd40c17632128545e2aa4c880535e208/code/src/netstring/netencoding.ml#L993 *) - let hex_digits = - [| - '0'; - '1'; - '2'; - '3'; - '4'; - '5'; - '6'; - '7'; - '8'; - '9'; - 'A'; - 'B'; - 'C'; - 'D'; - 'E'; - 'F'; - |] - - let to_hex2 k = - (* Converts k to a 2-digit hex string *) - let s = Bytes.create 2 in - Bytes.set s 0 hex_digits.((k lsr 4) land 15); - Bytes.set s 1 hex_digits.(k land 15); - Bytes.unsafe_to_string s - - let of_hex1 c = - match c with - | '0' .. '9' -> Char.code c - Char.code '0' - | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10 - | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10 - | _ -> raise Not_found - - let url_encoding_re = Netstring_str.regexp "[^A-Za-z0-9_.!*-]" - let url_decoding_re = Netstring_str.regexp "\\+\\|%..\\|%.\\|%" - - let encode ?(plus = true) s = - Netstring_str.global_substitute url_encoding_re - (fun r _ -> - match Netstring_str.matched_string r s with - | " " when plus -> "+" - | x -> - let k = Char.code x.[0] in - "%" ^ to_hex2 k) - s - - let decode ?(plus = true) ?(pos = 0) ?len s = - let s_l = String.length s in - let s1 = - if pos = 0 && len = None then s - else - let len = match len with Some n -> n | None -> s_l in - String.sub s pos len - in - let l = String.length s1 in - Netstring_str.global_substitute url_decoding_re - (fun r _ -> - match Netstring_str.matched_string r s1 with - | "+" -> if plus then " " else "+" - | _ -> ( - let i = Netstring_str.match_beginning r in - (* Assertion: s1.[i] = '%' *) - if i + 2 >= l then failwith "Web.Url.decode"; - let c1 = s1.[i + 1] in - let c2 = s1.[i + 2] in - try - let k1 = of_hex1 c1 in - let k2 = of_hex1 c2 in - String.make 1 (Char.chr ((k1 lsl 4) lor k2)) - with Not_found -> failwith "Web.Url.decode")) - s1 - - let url_split_re = Netstring_str.regexp "[&=]" - - let dest_url_encoded_parameters parstr = - let rec parse_after_amp tl = - match tl with - | Netstring_str.Text name - :: Netstring_str.Delim "=" - :: Netstring_str.Text value - :: tl' -> - (decode name, decode value) :: parse_next tl' - | Netstring_str.Text name - :: Netstring_str.Delim "=" - :: Netstring_str.Delim "&" - :: tl' -> - (decode name, "") :: parse_after_amp tl' - | [ Netstring_str.Text name; Netstring_str.Delim "=" ] -> - [ (decode name, "") ] - | _ -> failwith "Web.Url.dest_url_encoded_parameters" - and parse_next tl = - match tl with - | [] -> [] - | Netstring_str.Delim "&" :: tl' -> parse_after_amp tl' - | _ -> failwith "Web.Url.dest_url_encoded_parameters" - in - let toklist = Netstring_str.full_split url_split_re parstr in - match toklist with [] -> [] | _ -> parse_after_amp toklist - -end - (** percent-encode (convert space into %20) *) let rawurlencode = Url.encode ~plus:false From 380fa875db9c55ab7ffa1a300adce1d5a2bcf091 Mon Sep 17 00:00:00 2001 From: Javier Chavarri Date: Tue, 12 Dec 2023 11:30:04 +0000 Subject: [PATCH 3/8] add netencoding.html --- httpev.ml | 2 +- ocamlnet_lite/netaux.ml | 53 + ocamlnet_lite/netaux.mli | 31 + ocamlnet_lite/netbuffer.ml | 88 ++ ocamlnet_lite/netbuffer.mli | 19 + ocamlnet_lite/netconversion.ml | 2084 +++++++++++++++++++++++++++ ocamlnet_lite/netconversion.mli | 504 +++++++ ocamlnet_lite/netdb.ml | 34 + ocamlnet_lite/netdb.mli | 38 + ocamlnet_lite/netencoding.ml | 661 +++++++++ ocamlnet_lite/netencoding.mli | 137 ++ ocamlnet_lite/netmappings.ml | 39 + ocamlnet_lite/netmappings.mli | 46 + ocamlnet_lite/netstring_str.ml | 50 +- ocamlnet_lite/netstring_str.mli | 24 + ocamlnet_lite/netstring_tstring.ml | 171 +++ ocamlnet_lite/netstring_tstring.mli | 82 ++ ocamlnet_lite/netsys_types.ml | 32 + ocamlnet_lite/netsys_types.mli | 32 + ocamlnet_lite/url.ml | 100 -- ocamlnet_lite/url.mli | 39 - web.ml | 8 +- 22 files changed, 4125 insertions(+), 149 deletions(-) create mode 100644 ocamlnet_lite/netaux.ml create mode 100644 ocamlnet_lite/netaux.mli create mode 100644 ocamlnet_lite/netbuffer.ml create mode 100644 ocamlnet_lite/netbuffer.mli create mode 100644 ocamlnet_lite/netconversion.ml create mode 100644 ocamlnet_lite/netconversion.mli create mode 100644 ocamlnet_lite/netdb.ml create mode 100644 ocamlnet_lite/netdb.mli create mode 100644 ocamlnet_lite/netencoding.ml create mode 100644 ocamlnet_lite/netencoding.mli create mode 100644 ocamlnet_lite/netmappings.ml create mode 100644 ocamlnet_lite/netmappings.mli create mode 100644 ocamlnet_lite/netstring_tstring.ml create mode 100644 ocamlnet_lite/netstring_tstring.mli create mode 100644 ocamlnet_lite/netsys_types.ml create mode 100644 ocamlnet_lite/netsys_types.mli delete mode 100644 ocamlnet_lite/url.ml delete mode 100644 ocamlnet_lite/url.mli diff --git a/httpev.ml b/httpev.ml index ed668a9..948cd92 100644 --- a/httpev.ml +++ b/httpev.ml @@ -184,7 +184,7 @@ let get_content_length headers = | Some s -> try Some (int_of_string s) with _ -> failed Header (sprintf "content-length %S" s) let decode_args s = - try Web.Url.dest_url_encoded_parameters s with exn -> Exn.fail ~exn "decode_args : %S" s + try Ocamlnet_lite.Netencoding.Url.dest_url_encoded_parameters s with exn -> Exn.fail ~exn "decode_args : %S" s let acceptable_encoding headers = let split s c = List.map (String.strip ~chars:" \t\r\n") @@ Stre.nsplitc s c in diff --git a/ocamlnet_lite/netaux.ml b/ocamlnet_lite/netaux.ml new file mode 100644 index 0000000..a7a307f --- /dev/null +++ b/ocamlnet_lite/netaux.ml @@ -0,0 +1,53 @@ +module ArrayAux = struct + let int_blit_ref = + ref + (fun (src:int array) srcpos dest destpos len -> + (* A specialised version of Array.blit for int arrays. + * Faster than the polymorphic Array.blit for + * various reasons. + *) + if (len < 0 || srcpos < 0 || + srcpos+len > Array.length src || + destpos < 0 || + destpos+len > Array.length dest) then + invalid_arg "Netaux.ArrayAux.int_blit"; + if src != dest || destpos <= srcpos then ( + for i = 0 to len-1 do + Array.unsafe_set + dest + (destpos+i) + (Array.unsafe_get src (srcpos+i)) + done + ) else ( + for i = len-1 downto 0 do + Array.unsafe_set + dest + (destpos+i) + (Array.unsafe_get src (srcpos+i)) + done + ) + ) + + let int_blit src srcpos dest destpos len = + !int_blit_ref src srcpos dest destpos len + + let int_series_ref = + ref + (fun src srcpos dst dstpos len n -> + if (len < 0 || srcpos < 0 || dstpos < 0 || + srcpos+len > Array.length src || + dstpos+len > Array.length dst) + then + invalid_arg "Netaux.ArrayAux.int_series"; + + let s = ref n in + for i = 0 to len-1 do + Array.unsafe_set dst (dstpos+i) !s; + s := !s + Array.unsafe_get src (srcpos+i) + done + ) + + let int_series src srcpos dst dstpos len n = + !int_series_ref src srcpos dst dstpos len n + +end diff --git a/ocamlnet_lite/netaux.mli b/ocamlnet_lite/netaux.mli new file mode 100644 index 0000000..69934c8 --- /dev/null +++ b/ocamlnet_lite/netaux.mli @@ -0,0 +1,31 @@ +(** Internal auxiliary functions + * + * This is an internal module. + *) + +(* Auxiliary stuff *) + +module ArrayAux : sig + val int_blit : int array -> int -> int array -> int -> int -> unit + (** A specialisation of [Array.blit] for int arrays. + * (Performance reasons.) + *) + + val int_series : int array -> int -> int array -> int -> int -> int -> unit + (** [int_series src srcpos dst dstpos len n]: + * Computes for every [i], [0 <= i < len]: + * [dst.(dstpos+i) = n + SUM(j=0..(i-1): src.(srcpos+j)) ] + * + * It is expected that [src == dst] implies [srcpos >= dstpos]. + *) + + (**/**) + + val int_blit_ref : + (int array -> int -> int array -> int -> int -> unit) ref + (* Used by [Netaccel] to override the built-in implementation *) + + val int_series_ref : + (int array -> int -> int array -> int -> int -> int -> unit) ref + (* Used by [Netaccel] to override the built-in implementation *) +end diff --git a/ocamlnet_lite/netbuffer.ml b/ocamlnet_lite/netbuffer.ml new file mode 100644 index 0000000..a5539e9 --- /dev/null +++ b/ocamlnet_lite/netbuffer.ml @@ -0,0 +1,88 @@ +type t = { + mutable buffer : Bytes.t; + mutable buffer_length : int; (* = String.length buffer *) + mutable length : int; + create_length : int; +} + +(* To help the garbage collector: + * The 'buffer' has a minimum length of 31 bytes. This minimum can still + * be stored in the minor heap. + * The 'buffer' has a length which is always near a multiple of two. This + * limits the number of different bucket sizes, and simplifies reallocation + * of freed memory. + *) + +(* Optimal string length: + * Every string takes: 1 word for the header, enough words for the + * contents + 1 Null byte (for C compatibility). + * If the buffer grows, it is best to use a new string length such + * that the number of words is exactly twice as large as for the previous + * string. + * n: length of the previous string in bytes + * w: storage size of the previous string in words + * n': length of the new string in bytes + * w' = 2*w: storage size of the new string in words + * + * w = (n+1) / word_length + 1 + * [it is assumed that (n+1) is always a multiple of word_length] + * + * n' = (2*w - 1) * word_length - 1 + * + * n' = [2 * ( [n+1] / word_length + 1) - 1] * word_length - 1 + * = ... + * = (2*n + 2) + word_length - 1 + * = 2 * n + word_length + 1 + * + * n'+1 is again a multiple of word_length: + * n'+1 = 2*n + 2 + word_length + * = 2*(n+1) + word_length + * = a multiple of word_length because n+1 is a multiple of word_length + *) + +let word_length = Sys.word_size / 8 (* in bytes *) + +let create n = + let bl = max n 31 in + { + buffer = Bytes.create bl; + buffer_length = bl; + length = 0; + create_length = n; + } + +let contents b = Bytes.sub_string b.buffer 0 b.length +let to_bytes b = Bytes.sub b.buffer 0 b.length + +let to_tstring_poly : type s. t -> s Netstring_tstring.tstring_kind -> s = + fun b kind -> + match kind with + | Netstring_tstring.String_kind -> contents b + | Netstring_tstring.Bytes_kind -> to_bytes b + +let alloc_space b n = + let rec new_size s = + if s >= n then s else new_size ((2 * s) + word_length + 1) + in + let size = min (new_size b.buffer_length) Sys.max_string_length in + if size < n then failwith "Netbuffer: string too large"; + let buffer' = Bytes.create size in + Bytes.blit b.buffer 0 buffer' 0 b.length; + b.buffer <- buffer'; + b.buffer_length <- size + +let ensure_space b n = + (* Ensure that there are n bytes space in b *) + if n > b.buffer_length then alloc_space b n + +let add_internal blit b s k l = + ensure_space b (l + b.length); + blit s k b.buffer b.length l; + b.length <- b.length + l + +let add_substring b s k l = + if k < 0 || l < 0 || k > String.length s - l then + invalid_arg "Netbuffer.add_substring"; + add_internal Bytes.blit_string b s k l + +let add_string b s = add_substring b s 0 (String.length s) diff --git a/ocamlnet_lite/netbuffer.mli b/ocamlnet_lite/netbuffer.mli new file mode 100644 index 0000000..9a9afc5 --- /dev/null +++ b/ocamlnet_lite/netbuffer.mli @@ -0,0 +1,19 @@ +(** A Netbuffer.t is a buffer that can grow and shrink dynamically. *) + +type t + +val create : int -> t + (** Creates a netbuffer which allocates initially this number of bytes. + * The logical length is zero. + *) + +val to_tstring_poly : t -> 's Netstring_tstring.tstring_kind -> 's + (** Return the buffer in the format as selected by the arg *) + +(** {2 Appending strings} *) + +val add_string : t -> string -> unit + (** [add_string nb s]: Adds a copy of the string [s] to the logical end of + * the netbuffer [nb]. If necessary, [nb] grows. + *) + diff --git a/ocamlnet_lite/netconversion.ml b/ocamlnet_lite/netconversion.ml new file mode 100644 index 0000000..c07e318 --- /dev/null +++ b/ocamlnet_lite/netconversion.ml @@ -0,0 +1,2084 @@ +open Netaux.ArrayAux + +exception Malformed_code +exception Cannot_represent of int + +let multibyte_limit = (* 6 *) 50 + +(* The longest multibyte character of all supported encodings, + * and the longest substitution string. + *) + +let big_slice = (* 3 *) 250 + +(* The typical length of slices *) + +(* Seems to be a good source: ftp://dkuug.dk/i18n/charmaps + *) + +type encoding = + [ `Enc_utf8 (* UTF-8 *) + | `Enc_utf8_opt_bom + | `Enc_java + | `Enc_utf16 (* UTF-16 with unspecified endianess (restricted usage) *) + | `Enc_utf16_le (* UTF-16 little endian *) + | `Enc_utf16_be (* UTF-16 big endian *) + | `Enc_utf32 (* UTF-32 with unspecified endianess (restricted usage) *) + | `Enc_utf32_le (* UTF-32 little endian *) + | `Enc_utf32_be (* UTF-32 big endian *) + | `Enc_usascii (* US-ASCII (only 7 bit) *) + | `Enc_iso88591 (* ISO-8859-1 *) + | `Enc_iso88592 (* ISO-8859-2 *) + | `Enc_iso88593 (* ISO-8859-3 *) + | `Enc_iso88594 (* ISO-8859-4 *) + | `Enc_iso88595 (* ISO-8859-5 *) + | `Enc_iso88596 (* ISO-8859-6 *) + | `Enc_iso88597 (* ISO-8859-7 *) + | `Enc_iso88598 (* ISO-8859-8 *) + | `Enc_iso88599 (* ISO-8859-9 *) + | `Enc_iso885910 (* ISO-8859-10 *) + | `Enc_iso885911 (* ISO-8859-11 *) + | `Enc_iso885913 (* ISO-8859-13 *) + | `Enc_iso885914 (* ISO-8859-14 *) + | `Enc_iso885915 (* ISO-8859-15 *) + | `Enc_iso885916 (* ISO-8859-16 *) + | `Enc_koi8r + (* KOI8-R *) + (* http://koi8.pp.ru *) + | (*| `Enc_koi8u (* KOI8-U *) (* http://www.net.ua/KOI8-U/index.html *)*) + `Enc_jis0201 + (* JIS-X-0201 *) + | (* + | `Enc_jis0201_roman (* JIS-X-0201 only roman half *) + | `Enc_jis0201_kana (* JIS-X-0201 katakana half remapped to 0x21..XXX *) + | `Enc_jis0208_94x94 (* JIS-X-0208 in ISO-2022-style two byte encoding *) + | `Enc_jis0212_94x94 (* JIS-X-0212 in ISO-2022-style two byte encoding *) + *) + `Enc_eucjp + (* EUC-JP *) + | `Enc_euckr (* EUC-KR *) + | (* + | `Enc_iso2022 of iso2022_state + | `Enc_iso2022jp of iso2022jp_state + *) + (* Older standards: *) + `Enc_asn1_iso646 + (* only the language-neutral subset *) + | `Enc_asn1_T61 (* ITU T.61 ("Teletex") *) + | `Enc_asn1_printable + | (* Microsoft: *) + `Enc_windows1250 (* WINDOWS-1250 *) + | `Enc_windows1251 (* WINDOWS-1251 *) + | `Enc_windows1252 (* WINDOWS-1252 *) + | `Enc_windows1253 (* WINDOWS-1253 *) + | `Enc_windows1254 (* WINDOWS-1254 *) + | `Enc_windows1255 (* WINDOWS-1255 *) + | `Enc_windows1256 (* WINDOWS-1256 *) + | `Enc_windows1257 (* WINDOWS-1257 *) + | `Enc_windows1258 (* WINDOWS-1258 *) + | (* IBM, ASCII-based: *) + `Enc_cp437 + | `Enc_cp737 + | `Enc_cp775 + | `Enc_cp850 + | `Enc_cp852 + | `Enc_cp855 + | `Enc_cp856 + | `Enc_cp857 + | `Enc_cp860 + | `Enc_cp861 + | `Enc_cp862 + | `Enc_cp863 + | `Enc_cp864 + | `Enc_cp865 + | `Enc_cp866 (* Russian *) + | `Enc_cp869 + | `Enc_cp874 + | `Enc_cp1006 + | (* IBM, EBCDIC-based: *) + `Enc_cp037 + (* EBCDIC USA Canada *) + (* 273: EBCDIC Germany, Austria, + * 277: Denmark, Norway, + * 278: Finland, Sweden, + * 280: Italy, + * 284: Spain, Latin America, + * 285: United Kingdom, + * 297: France, + * 871: Iceland, + *) + | `Enc_cp424 + | `Enc_cp500 (* EBCDIC International *) + | `Enc_cp875 (* EBCDIC Modern Greek *) + | `Enc_cp1026 (* EBCDIC Turkish *) + | `Enc_cp1047 (* EBCDIC Latin1, OS 390 System Services *) + | (* Adobe: *) + `Enc_adobe_standard_encoding + | `Enc_adobe_symbol_encoding + | `Enc_adobe_zapf_dingbats_encoding + | (* Apple: *) + `Enc_macroman + | (* Encoding subset: *) + `Enc_subset of encoding * (int -> bool) + | `Enc_empty ] + +type charset = + [ `Set_unicode (* The full Unicode repertoire *) + | `Set_usascii (* US-ASCII (only 7 bit) *) + | `Set_iso88591 (* ISO-8859-1 *) + | `Set_iso88592 (* ISO-8859-2 *) + | `Set_iso88593 (* ISO-8859-3 *) + | `Set_iso88594 (* ISO-8859-4 *) + | `Set_iso88595 (* ISO-8859-5 *) + | `Set_iso88596 (* ISO-8859-6 *) + | `Set_iso88597 (* ISO-8859-7 *) + | `Set_iso88598 (* ISO-8859-8 *) + | `Set_iso88599 (* ISO-8859-9 *) + | `Set_iso885910 (* ISO-8859-10 *) + | `Set_iso885911 (* ISO-8859-11 *) + | `Set_iso885913 (* ISO-8859-13 *) + | `Set_iso885914 (* ISO-8859-14 *) + | `Set_iso885915 (* ISO-8859-15 *) + | `Set_iso885916 (* ISO-8859-16 *) + | `Set_koi8r (* KOI8-R *) + | `Set_jis0201 (* JIS-X-0201 *) + | `Set_jis0208 (* JIS-X-0208 *) + | `Set_jis0212 (* JIS-X-0212 *) + | `Set_ks1001 (* KS-X-1001 *) + | `Set_asn1_iso646 + | `Set_asn1_T61 + | `Set_asn1_printable + | (* Microsoft: *) + `Set_windows1250 (* WINDOWS-1250 *) + | `Set_windows1251 (* WINDOWS-1251 *) + | `Set_windows1252 (* WINDOWS-1252 *) + | `Set_windows1253 (* WINDOWS-1253 *) + | `Set_windows1254 (* WINDOWS-1254 *) + | `Set_windows1255 (* WINDOWS-1255 *) + | `Set_windows1256 (* WINDOWS-1256 *) + | `Set_windows1257 (* WINDOWS-1257 *) + | `Set_windows1258 (* WINDOWS-1258 *) + | (* IBM, ASCII-based: *) + `Set_cp437 + | `Set_cp737 + | `Set_cp775 + | `Set_cp850 + | `Set_cp852 + | `Set_cp855 + | `Set_cp856 + | `Set_cp857 + | `Set_cp860 + | `Set_cp861 + | `Set_cp862 + | `Set_cp863 + | `Set_cp864 + | `Set_cp865 + | `Set_cp866 + | `Set_cp869 + | `Set_cp874 + | `Set_cp1006 + | (* IBM, EBCDIC-based: *) + `Set_cp037 + | `Set_cp424 + | `Set_cp500 + | `Set_cp875 + | `Set_cp1026 + | `Set_cp1047 + | (* Adobe: *) + `Set_adobe_standard_encoding + | `Set_adobe_symbol_encoding + | `Set_adobe_zapf_dingbats_encoding + | (* Apple: *) + `Set_macroman ] + +let ascii_compat_encodings = + [ + `Enc_utf8; + `Enc_utf8_opt_bom; + `Enc_java; + `Enc_usascii; + `Enc_iso88591; + `Enc_iso88592; + `Enc_iso88593; + `Enc_iso88594; + `Enc_iso88595; + `Enc_iso88596; + `Enc_iso88597; + `Enc_iso88598; + `Enc_iso88599; + `Enc_iso885910; + `Enc_iso885911; + `Enc_iso885913; + `Enc_iso885914; + `Enc_iso885915; + `Enc_iso885916; + `Enc_koi8r; + `Enc_windows1250; + `Enc_windows1251; + `Enc_windows1252; + `Enc_windows1253; + `Enc_windows1254; + `Enc_windows1255; + `Enc_windows1256; + `Enc_windows1257; + `Enc_windows1258; + `Enc_cp437; + `Enc_cp737; + `Enc_cp775; + `Enc_cp850; + `Enc_cp852; + `Enc_cp855; + `Enc_cp856; + `Enc_cp857; + `Enc_cp860; + `Enc_cp861; + `Enc_cp862; + `Enc_cp863; + `Enc_cp864; + `Enc_cp865; + `Enc_cp866; + `Enc_cp869; + `Enc_cp874; + `Enc_cp1006; + `Enc_eucjp; + `Enc_euckr; + `Enc_macroman; + ] + +let rec is_ascii_compatible = function + | `Enc_subset (e, _) -> is_ascii_compatible e + | e -> List.mem e ascii_compat_encodings + +let rec is_single_byte = function + | `Enc_utf8 | `Enc_utf8_opt_bom | `Enc_java | `Enc_utf16 | `Enc_utf16_le + | `Enc_utf16_be | `Enc_utf32 | `Enc_utf32_le | `Enc_utf32_be -> + false + | `Enc_eucjp -> false + | `Enc_euckr -> false + | `Enc_subset (e, _) -> is_single_byte e + | _ -> true + +let internal_name (cs : charset) = + (* The name used for netdb lookups *) + match cs with + | `Set_unicode -> "unicode" + | `Set_usascii -> "usascii" + | `Set_iso88591 -> "iso88591" + | `Set_iso88592 -> "iso88592" + | `Set_iso88593 -> "iso88593" + | `Set_iso88594 -> "iso88594" + | `Set_iso88595 -> "iso88595" + | `Set_iso88596 -> "iso88596" + | `Set_iso88597 -> "iso88597" + | `Set_iso88598 -> "iso88598" + | `Set_iso88599 -> "iso88599" + | `Set_iso885910 -> "iso885910" + | `Set_iso885911 -> "iso885911" + | `Set_iso885913 -> "iso885913" + | `Set_iso885914 -> "iso885914" + | `Set_iso885915 -> "iso885915" + | `Set_iso885916 -> "iso885916" + | `Set_koi8r -> "koi8r" + | `Set_jis0201 -> "jis0201" + | `Set_jis0208 -> "jis0208" + | `Set_jis0212 -> "jis0212" + | `Set_ks1001 -> "ks1001" + | `Set_asn1_iso646 -> "asn1_iso646" + | `Set_asn1_T61 -> "asn1_t61" + | `Set_asn1_printable -> "asn1_printable" + | `Set_windows1250 -> "windows1250" + | `Set_windows1251 -> "windows1251" + | `Set_windows1252 -> "windows1252" + | `Set_windows1253 -> "windows1253" + | `Set_windows1254 -> "windows1254" + | `Set_windows1255 -> "windows1255" + | `Set_windows1256 -> "windows1256" + | `Set_windows1257 -> "windows1257" + | `Set_windows1258 -> "windows1258" + | `Set_cp437 -> "cp437" + | `Set_cp737 -> "cp737" + | `Set_cp775 -> "cp775" + | `Set_cp850 -> "cp850" + | `Set_cp852 -> "cp852" + | `Set_cp855 -> "cp855" + | `Set_cp856 -> "cp856" + | `Set_cp857 -> "cp857" + | `Set_cp860 -> "cp860" + | `Set_cp861 -> "cp861" + | `Set_cp862 -> "cp862" + | `Set_cp863 -> "cp863" + | `Set_cp864 -> "cp864" + | `Set_cp865 -> "cp865" + | `Set_cp866 -> "cp866" + | `Set_cp869 -> "cp869" + | `Set_cp874 -> "cp874" + | `Set_cp1006 -> "cp1006" + | `Set_cp037 -> "cp037" + | `Set_cp424 -> "cp424" + | `Set_cp500 -> "cp500" + | `Set_cp875 -> "cp875" + | `Set_cp1026 -> "cp1026" + | `Set_cp1047 -> "cp1047" + | `Set_adobe_standard_encoding -> "adobe_standard_encoding" + | `Set_adobe_symbol_encoding -> "adobe_symbol_encoding" + | `Set_adobe_zapf_dingbats_encoding -> "adobe_zapf_dingbats_encoding" + | `Set_macroman -> "macroman" + +let rec required_charsets (e : encoding) = + (* The name is a bit misleading. The function returns the charsets that + * correspond to the conversion tables that are required to support the + * encoding. + *) + match e with + | `Enc_utf8 | `Enc_utf8_opt_bom | `Enc_java | `Enc_utf16 | `Enc_utf16_le + | `Enc_utf16_be | `Enc_utf32 | `Enc_utf32_le | `Enc_utf32_be -> + [] + | `Enc_usascii -> [] + | `Enc_iso88591 -> [] + | `Enc_iso88592 -> [ `Set_iso88592 ] + | `Enc_iso88593 -> [ `Set_iso88593 ] + | `Enc_iso88594 -> [ `Set_iso88594 ] + | `Enc_iso88595 -> [ `Set_iso88595 ] + | `Enc_iso88596 -> [ `Set_iso88596 ] + | `Enc_iso88597 -> [ `Set_iso88597 ] + | `Enc_iso88598 -> [ `Set_iso88598 ] + | `Enc_iso88599 -> [ `Set_iso88599 ] + | `Enc_iso885910 -> [ `Set_iso885910 ] + | `Enc_iso885911 -> [ `Set_iso885911 ] + | `Enc_iso885913 -> [ `Set_iso885913 ] + | `Enc_iso885914 -> [ `Set_iso885914 ] + | `Enc_iso885915 -> [ `Set_iso885915 ] + | `Enc_iso885916 -> [ `Set_iso885916 ] + | `Enc_koi8r -> [ `Set_koi8r ] + | `Enc_jis0201 -> [ `Set_jis0201 ] + | `Enc_eucjp -> [ `Set_jis0201; `Set_jis0208; `Set_jis0212 ] + | `Enc_euckr -> [ `Set_ks1001 ] + | `Enc_asn1_iso646 -> [ `Set_asn1_iso646 ] + | `Enc_asn1_T61 -> [ `Set_asn1_T61 ] + | `Enc_asn1_printable -> [ `Set_asn1_printable ] + | `Enc_windows1250 -> [ `Set_windows1250 ] + | `Enc_windows1251 -> [ `Set_windows1251 ] + | `Enc_windows1252 -> [ `Set_windows1252 ] + | `Enc_windows1253 -> [ `Set_windows1253 ] + | `Enc_windows1254 -> [ `Set_windows1254 ] + | `Enc_windows1255 -> [ `Set_windows1255 ] + | `Enc_windows1256 -> [ `Set_windows1256 ] + | `Enc_windows1257 -> [ `Set_windows1257 ] + | `Enc_windows1258 -> [ `Set_windows1258 ] + | `Enc_cp437 -> [ `Set_cp437 ] + | `Enc_cp737 -> [ `Set_cp737 ] + | `Enc_cp775 -> [ `Set_cp775 ] + | `Enc_cp850 -> [ `Set_cp850 ] + | `Enc_cp852 -> [ `Set_cp852 ] + | `Enc_cp855 -> [ `Set_cp855 ] + | `Enc_cp856 -> [ `Set_cp856 ] + | `Enc_cp857 -> [ `Set_cp857 ] + | `Enc_cp860 -> [ `Set_cp860 ] + | `Enc_cp861 -> [ `Set_cp861 ] + | `Enc_cp862 -> [ `Set_cp862 ] + | `Enc_cp863 -> [ `Set_cp863 ] + | `Enc_cp864 -> [ `Set_cp864 ] + | `Enc_cp865 -> [ `Set_cp865 ] + | `Enc_cp866 -> [ `Set_cp866 ] + | `Enc_cp869 -> [ `Set_cp869 ] + | `Enc_cp874 -> [ `Set_cp874 ] + | `Enc_cp1006 -> [ `Set_cp1006 ] + | `Enc_cp037 -> [ `Set_cp037 ] + | `Enc_cp424 -> [ `Set_cp424 ] + | `Enc_cp500 -> [ `Set_cp500 ] + | `Enc_cp875 -> [ `Set_cp875 ] + | `Enc_cp1026 -> [ `Set_cp1026 ] + | `Enc_cp1047 -> [ `Set_cp1047 ] + | `Enc_adobe_standard_encoding -> [ `Set_adobe_standard_encoding ] + | `Enc_adobe_symbol_encoding -> [ `Set_adobe_symbol_encoding ] + | `Enc_adobe_zapf_dingbats_encoding -> [ `Set_adobe_zapf_dingbats_encoding ] + | `Enc_macroman -> [ `Set_macroman ] + | `Enc_subset (e', _) -> required_charsets e' + | `Enc_empty -> [] + +(* Internal conversion interface: + * + * let (n_char, n_byte, enc') = read_XXX slice_char slice_blen s_in p_in l_in: + * + * - Scans the bytes from position p_in until the slice is decoded, but at + * most until the last position p_in+l_in-1 of the input string s_in, and + * decodes the character for the selected encoding. + * - "slice_char" is a preallocated array of ints storing the code points + * of the characters. It is allowed that "slice_char" is only partially + * filled with characters. In this case, there must be a -1 after the + * last valid code point. + * - "slice_blen" is another "int array" with the same size as "slice_char". + * It contains the byte length of every character. It is initialized with + * a sequence of ones, so single-byte readers don't have to worry about + * this array. + * - Returns: + * * n_char: the number of decoded characters + * * n_byte: the number of scanned bytes ( <= l_in ) + * * enc': the new encoding + * - In the case of multi-byte encodings it is possible that + * the last byte to read at position p_in+l_in-1 is the beginning of + * a character. This character is excluded from being decoded. + * - Errors: If an invalid byte sequence is found, the exception + * Malformed_code_read(_,_,_) is raised. The exception returns the + * triple (n_char, n_byte, enc') describing how much could be read + * before the reader ran into the bad sequence. slice_char and slice_blen + * are only partially initialized, with a (-1) at the end of slice_char. + * + * let (n_char, n_byte) = + * write_XXX slice_char slice_pos slice_length s_out p_out l_out subst + * + * - Writes the characters found in slice_char to s_out. Only the elements + * from slice_pos to slice_pos + slice_length -1 are written. The resulting + * bytes are written to s_out from byte position p_out to p_out+l_out-1. + * - There must not be a -1 (EOF mark) in the first slice_length characters + * of slice_char. + * - Only whole characters must be written. + * - For code points p that cannot be represented in the output + * encoding, the function subst is called. The function must return + * the (already encoded) string to substitute. This must be a small string. + * - Of course, p >= 0. As special case, p >= 0x110000 may be used to force + * that subst is called (it is assumed that max_int can be never + * represented). + * - Returns: + * * n_char: the number of processed characters + * * n_byte: the number of written bytes ( <= l_in ) + * + * let (n_char, n_byte) = + * back_XXX s_in range_in range_in_len p_in n_char: + * + * - The substring of s_in beginning at range_in and with length + * range_in_len is considered as the valid range + * - The cursor is at byte position p_in and goes n_char characters back + * - The routine returns: + * * n_char: the characters the cursor was actually moved backwards + * * n_byte: the bytes the cursor was actually moved backwards + * - The validity of the input encoding needs not to be checked + *) + +exception Malformed_code_read of (int * int * encoding);; + +(* not exported! *) + +Callback.register_exception "Netconversion.Malformed_code_read" + (Malformed_code_read (0, 0, `Enc_empty)) + +(* Needed by netaccel_c.c *) + +(* UNSAFE_OPT: A number of functions have been optimized by using + * unsafe features of O'Caml (unsafe_get, unsafe_set, unsafe_chr). + * These functions have been checked very carefully, and there are + * a lot of comments arguing about the correctness of the array + * and string accesses. + *) + +type poly_reader = { + read : + 's. + 's Netstring_tstring.tstring_ops -> + int array -> + int array -> + 's -> + int -> + int -> + int * int * encoding; +} + +let read_iso88591 maxcode enc = + (* UNSAFE_OPT *) + let read ops slice_char slice_blen s_in p_in l_in = + let open Netstring_tstring in + assert (Array.length slice_char = Array.length slice_blen); + assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); + let m = min l_in (Array.length slice_char) in + let m3 = m / 3 in + for k3 = 0 to m3 - 1 do + let k = 3 * k3 in + (* let ch = Char.code s_in.[ p_in + k ] in *) + let chars = ops.unsafe_get3 s_in (p_in + k) in + let c0 = chars lsr 16 in + let c1 = (chars lsr 8) land 0xff in + let c2 = chars land 0xff in + if c0 > maxcode then ( + slice_char.(k) <- -1; + raise (Malformed_code_read (k, k, enc))); + Array.unsafe_set slice_char k c0; + if c1 > maxcode then ( + slice_char.(k + 1) <- -1; + raise (Malformed_code_read (k + 1, k + 1, enc))); + Array.unsafe_set slice_char (k + 1) c1; + if c2 > maxcode then ( + slice_char.(k + 2) <- -1; + raise (Malformed_code_read (k + 2, k + 2, enc))); + Array.unsafe_set slice_char (k + 2) c2 + done; + for k = 3 * m3 to m - 1 do + let c0 = Char.code (ops.unsafe_get s_in (p_in + k)) in + if c0 > maxcode then ( + slice_char.(k) <- -1; + raise (Malformed_code_read (k, k, enc))); + Array.unsafe_set slice_char k c0 + done; + if m < Array.length slice_char then slice_char.(m) <- -1; + (m, m, enc) + in + { read } + +let read_iso88591_ref = ref read_iso88591 + +let get_8bit_to_unicode_map enc = + let cs = + match required_charsets enc with + | [ cs ] -> cs + | _ -> failwith "get_8bit_to_unicode_map" + in + let to_unicode = Netmappings.get_to_unicode (internal_name cs) in + assert (Array.length to_unicode = 256); + to_unicode + +let read_8bit enc = + let m_to_unicode = get_8bit_to_unicode_map enc in + + (* the 256-byte array mapping the character set to unicode *) + let read ops slice_char slice_blen s_in p_in l_in = + (* UNSAFE_OPT *) + let open Netstring_tstring in + assert (Array.length slice_char = Array.length slice_blen); + assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); + let m = min l_in (Array.length slice_char) in + let m3 = m / 3 in + for k3 = 0 to m3 - 1 do + let k = 3 * k3 in + let chars = ops.unsafe_get3 s_in k in + let c0 = chars lsr 16 in + let c1 = (chars lsr 8) land 0xff in + let c2 = chars land 0xff in + let c0_uni = Array.unsafe_get m_to_unicode c0 in + if c0_uni < 0 then ( + slice_char.(k) <- -1; + raise (Malformed_code_read (k, k, enc))); + Array.unsafe_set slice_char k c0_uni; + let c1_uni = Array.unsafe_get m_to_unicode c1 in + if c1_uni < 0 then ( + slice_char.(k + 1) <- -1; + raise (Malformed_code_read (k + 1, k + 1, enc))); + Array.unsafe_set slice_char (k + 1) c1_uni; + let c2_uni = Array.unsafe_get m_to_unicode c2 in + if c2_uni < 0 then ( + slice_char.(k + 2) <- -1; + raise (Malformed_code_read (k + 2, k + 2, enc))); + Array.unsafe_set slice_char (k + 2) c2_uni + done; + for k = 3 * m3 to m - 1 do + let c0 = Char.code (ops.get s_in k) in + let c0_uni = Array.unsafe_get m_to_unicode c0 in + if c0_uni < 0 then ( + slice_char.(k) <- -1; + raise (Malformed_code_read (k, k, enc))); + Array.unsafe_set slice_char k c0_uni + done; + if m < Array.length slice_char then slice_char.(m) <- -1; + (m, m, enc) + in + { read } + +let read_utf8 is_java = + (* UNSAFE_OPT *) + let read ops slice_char slice_blen s_in p_in l_in = + let open Netstring_tstring in + assert (Array.length slice_char = Array.length slice_blen); + assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); + + (* k: counts the bytes + * n: counts the characters + *) + let p = ref p_in in + let p_max = p_in + l_in in + let n = ref 0 in + let n_ret = ref (-1) in + + let malformed_code () = + slice_char.(!n) <- -1; + raise (Malformed_code_read (!n, !p - p_in, `Enc_utf8)) + in + + let slice_length = Array.length slice_char in + + while !p < p_max && !n < slice_length do + let k_inc = + (* length of the character in bytes; 0 means: stop *) + + (* We know: + * (1) p_in >= 0 ==> !p >= 0 + * (2) !p < p_max = p_in + l_in <= String.length s_in + * ==> unsafe get ok + *) + (* match s_in.[k_in + k] with *) + match ops.unsafe_get s_in !p with + | '\000' -> + if is_java then malformed_code (); + (* slice_char.(n) <- 0; *) + Array.unsafe_set slice_char !n 0; + (* ok *) + 1 + | '\001' .. '\127' as x -> + (* slice_char.(n) <- Char.code x; *) + Array.unsafe_set slice_char !n (Char.code x); + (* ok *) + 1 + | '\128' .. '\223' as x -> + if !p + 1 >= p_max then 0 + else + (* ==> !p+1 < p_max = p_in + l_in <= String.length s_in + * ==> unsafe get ok + *) + let n1 = Char.code x in + let n2 = + (* Char.code (s_in.[!p + 1]) *) + Char.code (ops.unsafe_get s_in (!p + 1)) + in + if is_java && n1 = 0x80 && n2 = 0xc0 then ( + (* slice_char.(n) <- 0; *) + Array.unsafe_set slice_char !n 0; + (* ok *) + 2) + else ( + if n2 < 128 || n2 > 191 then malformed_code (); + let p = ((n1 land 0b11111) lsl 6) lor (n2 land 0b111111) in + if p < 128 then malformed_code (); + (* slice_char.(n) <- p; *) + Array.unsafe_set slice_char !n p; + (* ok *) + 2) + | '\224' .. '\239' as x -> + if !p + 2 >= p_max then 0 + else + (* ==> !p+2 < p_max = p_in + l_in <= String.length s_in + * ==> unsafe get ok + *) + let n1 = Char.code x in + let n2 = + (* Char.code (s_in.[!p + 1]) *) + Char.code (ops.unsafe_get s_in (!p + 1)) + in + let n3 = + (* Char.code (s_in.[!p + 2]) *) + Char.code (ops.unsafe_get s_in (!p + 2)) + in + if n2 < 128 || n2 > 191 then malformed_code (); + if n3 < 128 || n3 > 191 then malformed_code (); + let p = + ((n1 land 0b1111) lsl 12) + lor ((n2 land 0b111111) lsl 6) + lor (n3 land 0b111111) + in + if p < 0x800 then malformed_code (); + if p >= 0xd800 && p < 0xe000 then + (* Surrogate pairs are not supported in UTF-8 *) + malformed_code (); + if p >= 0xfffe && p <= 0xffff then malformed_code (); + (* slice_char.(n) <- p; *) + Array.unsafe_set slice_char !n p; + (* ok *) + 3 + | '\240' .. '\247' as x -> + if !p + 3 >= p_max then 0 + else + (* ==> !p+3 < p_max = p_in + l_in <= String.length s_in + * ==> unsafe get ok + *) + let n1 = Char.code x in + let chars = ops.unsafe_get3 s_in (!p + 1) in + let n2 = chars lsr 16 in + let n3 = (chars lsr 8) land 0xff in + let n4 = chars land 0xff in + if n2 < 128 || n2 > 191 then malformed_code (); + if n3 < 128 || n3 > 191 then malformed_code (); + if n4 < 128 || n4 > 191 then malformed_code (); + let p = + ((n1 land 0b111) lsl 18) + lor ((n2 land 0b111111) lsl 12) + lor ((n3 land 0b111111) lsl 6) + lor (n4 land 0b111111) + in + if p < 0x10000 then malformed_code (); + if p >= 0x110000 then + (* These code points are not supported. *) + malformed_code (); + (* slice_char.(n) <- p; *) + Array.unsafe_set slice_char !n p; + (* ok *) + 4 + | _ -> + (* Outside the valid range of XML characters *) + malformed_code () + in + + (* If k_inc = 0, the character was partially outside the processed + * range of the string, and could not be decoded. + *) + if k_inc > 0 then ( + (* We know: + * (1) n >= 0, because n starts with 0 and is only increased + * (2) n < Array.length slice_char = Array.length slice_blen + * ==> unsafe set ok + *) + (* slice_blen.(n) <- k_inc; *) + Array.unsafe_set slice_blen !n k_inc; + (* next iteration: *) + p := !p + k_inc; + incr n) + else ( + (* Stop loop: *) + n_ret := !n; + n := slice_length) + done; + + if !n_ret = -1 then n_ret := !n; + if !n_ret < slice_length then (* EOF marker *) + slice_char.(!n_ret) <- -1; + (!n_ret, !p - p_in, `Enc_utf8) + in + { read } + +let read_utf8_ref = ref read_utf8 + +let have_utf8_bom ops s p = + let open Netstring_tstring in + let c0 = ops.get s (p + 0) in + let c1 = ops.get s (p + 1) in + let c2 = ops.get s (p + 2) in + c0 = '\xEF' && c1 = '\xBB' && c2 = '\xBF' + +let read_utf8_opt_bom expose_bom = + let read ops slice_char slice_blen s_in p_in l_in = + let open Netstring_tstring in + assert (Array.length slice_char = Array.length slice_blen); + assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); + (* Expect a BOM at the beginning of the text *) + if l_in >= 3 then + if have_utf8_bom ops s_in p_in then ( + let p_in1, l_in1 = + if expose_bom then (p_in, l_in) else (p_in + 3, l_in - 3) + in + let n_ret, p_ret, enc = + (!read_utf8_ref false).read ops slice_char slice_blen s_in p_in1 l_in1 + in + let p_ret1 = if expose_bom then p_ret else p_ret + 3 in + if expose_bom && n_ret >= 1 then slice_char.(0) <- -3; + (n_ret, p_ret1, enc)) + else (!read_utf8_ref false).read ops slice_char slice_blen s_in p_in l_in + else + let bom_possible = + l_in = 0 + || (l_in = 1 && ops.get s_in 0 = '\xEF') + || (l_in = 2 && ops.get s_in 0 = '\xEF' && ops.get s_in 1 = '\xBB') + in + if bom_possible then (0, 0, `Enc_utf8_opt_bom) + else (!read_utf8_ref false).read ops slice_char slice_blen s_in p_in l_in + in + { read } + +let surrogate_offset = 0x10000 - (0xD800 lsl 10) - 0xDC00 + +let read_utf16_lebe lo hi n_start enc = + (* lo=0, hi=1: little endian + * lo=1, hi=0: big endian + * n_start: First cell in slice to use + *) + let read ops slice_char slice_blen s_in p_in l_in = + let open Netstring_tstring in + assert (Array.length slice_char = Array.length slice_blen); + assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); + + let malformed_code k n = + slice_char.(n) <- -1; + raise (Malformed_code_read (n, k, enc)) + in + + (* k: counts the bytes + * n: counts the characters + *) + let rec put_loop k n = + if k + 1 < l_in && n < Array.length slice_char then + let p = + Char.code (ops.get s_in (p_in + k + lo)) + lor (Char.code (ops.get s_in (p_in + k + hi)) lsl 8) + in + + if p >= 0xd800 && p < 0xe000 then + (* This is a surrogate pair. *) + if k + 3 < l_in then + if p <= 0xdbff then ( + let q = + Char.code (ops.get s_in (p_in + k + 2 + lo)) + lor (Char.code (ops.get s_in (p_in + k + 2 + hi)) lsl 8) + in + if q < 0xdc00 || q > 0xdfff then malformed_code k n; + let eff_p = (p lsl 10) + q + surrogate_offset in + slice_char.(n) <- eff_p; + slice_blen.(n) <- 4; + put_loop (k + 4) (n + 1)) + else (* Malformed pair: *) + malformed_code k n + else (n, k) + else if (* Normal 2-byte character *) + p = 0xfffe then + (* Wrong byte order mark: It is illegal here *) + malformed_code k n + else ( + (* A regular code point *) + slice_char.(n) <- p; + slice_blen.(n) <- 2; + put_loop (k + 2) (n + 1)) + else (n, k) + in + let n, k = put_loop 0 n_start in + if n < Array.length slice_char then (* EOF marker *) + slice_char.(n) <- -1; + (n, k, enc) + in + { read } + +let get_endianess ops s_in p_in = + let open Netstring_tstring in + let c0 = ops.get s_in (p_in + 0) in + let c1 = ops.get s_in (p_in + 1) in + if c0 = '\254' && c1 = '\255' then `Big_endian + else if c0 = '\255' && c1 = '\254' then `Little_endian + else `No_BOM + +(* expose_bom: when true, the BOM is considered as a character and + * put as value (-3) into slice_char + *) + +let read_utf16 expose_bom = + let read ops slice_char slice_blen s_in p_in l_in = + let open Netstring_tstring in + assert (Array.length slice_char = Array.length slice_blen); + assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); + (* Expect a BOM at the beginning of the text *) + if l_in >= 2 then ( + if expose_bom then ( + slice_char.(0) <- -3; + slice_blen.(0) <- 0 (* Later corrected *)); + match get_endianess ops s_in p_in with + | `Big_endian -> + let n_start = if expose_bom then 1 else 0 in + let n, k, enc' = + (read_utf16_lebe 1 0 n_start `Enc_utf16_be).read ops slice_char + slice_blen s_in (p_in + 2) (l_in - 2) + in + if n > 0 then slice_blen.(0) <- slice_blen.(0) + 2; + (n, k + 2, enc') + | `Little_endian -> + let n_start = if expose_bom then 1 else 0 in + let n, k, enc' = + (read_utf16_lebe 0 1 n_start `Enc_utf16_le).read ops slice_char + slice_blen s_in (p_in + 2) (l_in - 2) + in + if n > 0 then slice_blen.(0) <- slice_blen.(0) + 2; + (n, k + 2, enc') + | `No_BOM -> + (* byte order mark missing *) + slice_char.(0) <- -1; + raise (Malformed_code_read (0, 0, `Enc_utf16))) + else ( + slice_char.(0) <- -1; + (0, 0, `Enc_utf16)) + in + { read } + +let read_utf32_lebe little n_start enc = + (* little: whether little endian + * n_start: First cell in slice to use + *) + let read ops slice_char slice_blen s_in p_in l_in = + let open Netstring_tstring in + assert (Array.length slice_char = Array.length slice_blen); + assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); + + let malformed_code k n = + slice_char.(n) <- -1; + raise (Malformed_code_read (n, k, enc)) + in + + let b0 = if little then 0 else 3 in + let b1 = if little then 1 else 2 in + let b2 = if little then 2 else 1 in + let b3 = if little then 3 else 0 in + + (* k: counts the bytes + * n: counts the characters + *) + let rec put_loop k n = + if k + 3 < l_in && n < Array.length slice_char then ( + let p3 = Char.code (ops.get s_in (p_in + k + b3)) in + if p3 <> 0 then malformed_code k n; + let p = + Char.code (ops.get s_in (p_in + k + b0)) + lor (Char.code (ops.get s_in (p_in + k + b1)) lsl 8) + lor (Char.code (ops.get s_in (p_in + k + b2)) lsl 16) + in + if (p >= 0xD800 && p <= 0xDFFF) || p >= 0x10FFFF then malformed_code k n; + if p = 0xfffe then + (* Wrong byte order mark: It is illegal here *) + malformed_code k n; + slice_char.(n) <- p; + slice_blen.(n) <- 4; + put_loop (k + 4) (n + 1)) + else (n, k) + in + let n, k = put_loop 0 n_start in + if n < Array.length slice_char then (* EOF marker *) + slice_char.(n) <- -1; + (n, k, enc) + in + { read } + +let get_endianess32 ops s_in p_in = + let open Netstring_tstring in + let c0 = ops.get s_in (p_in + 0) in + let c1 = ops.get s_in (p_in + 1) in + let c2 = ops.get s_in (p_in + 2) in + let c3 = ops.get s_in (p_in + 3) in + if c0 = '\000' && c1 = '\000' && c2 = '\254' && c3 = '\255' then `Big_endian + else if c0 = '\255' && c1 = '\254' && c2 = '\000' && c3 = '\000' then + `Little_endian + else `No_BOM + +let read_utf32 expose_bom = + let read ops slice_char slice_blen s_in p_in l_in = + let open Netstring_tstring in + assert (Array.length slice_char = Array.length slice_blen); + assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); + (* Expect a BOM at the beginning of the text *) + if l_in >= 4 then ( + if expose_bom then ( + slice_char.(0) <- -3; + slice_blen.(0) <- 0 (* Later corrected *)); + match get_endianess32 ops s_in p_in with + | `Big_endian -> + let n_start = if expose_bom then 1 else 0 in + let n, k, enc' = + (read_utf32_lebe false n_start `Enc_utf32_be).read ops slice_char + slice_blen s_in (p_in + 4) (l_in - 4) + in + if n > 0 then slice_blen.(0) <- slice_blen.(0) + 4; + (n, k + 4, enc') + | `Little_endian -> + let n_start = if expose_bom then 1 else 0 in + let n, k, enc' = + (read_utf32_lebe true n_start `Enc_utf32_le).read ops slice_char + slice_blen s_in (p_in + 4) (l_in - 4) + in + if n > 0 then slice_blen.(0) <- slice_blen.(0) + 4; + (n, k + 4, enc') + | `No_BOM -> + (* byte order mark missing *) + slice_char.(0) <- -1; + raise (Malformed_code_read (0, 0, `Enc_utf32))) + else ( + slice_char.(0) <- -1; + (0, 0, `Enc_utf32)) + in + { read } + +let read_euc len1 len2 len3 map1 map2 map3 enc = + (* Code set 0 is US-ASCII. + * Code sets 1, 2, 3 may be anything. lenX = 0: code set is not supported. + * lenX is either 0, 1, or 2. + *) + (* UNSAFE_OPT *) + let open Netstring_tstring in + assert (len1 >= 0 && len1 <= 2); + assert (len2 >= 0 && len2 <= 2); + assert (len3 >= 0 && len3 <= 2); + + let read ops slice_char slice_blen s_in p_in l_in = + assert (Array.length slice_char = Array.length slice_blen); + assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); + + (* k: counts the bytes + * n: counts the characters + *) + let p = ref p_in in + let p_max = p_in + l_in in + let n = ref 0 in + let n_ret = ref (-1) in + + let malformed_code () = + slice_char.(!n) <- -1; + raise (Malformed_code_read (!n, !p - p_in, enc)) + in + + let slice_length = Array.length slice_char in + + while !p < p_max && !n < slice_length do + let k_inc = + (* length of the character in bytes; 0 means: stop *) + + (* We know: + * (1) p_in >= 0 ==> !p >= 0 + * (2) !p < p_max = p_in + l_in <= String.length s_in + * ==> unsafe get ok + *) + (* match s_in.[k_in + k] with *) + match ops.unsafe_get s_in !p with + | '\000' .. '\127' as x -> + (* US-ASCII *) + Array.unsafe_set slice_char !n (Char.code x); + (* ok *) + 1 + | '\142' -> + (* Code set 2 *) + if len2 = 0 then malformed_code (); + if !p + len2 >= p_max then 0 + else + let x1 = Char.code (ops.get s_in (!p + 1)) in + let x2 = + if len2 = 1 then 256 else Char.code (ops.get s_in (!p + 2)) + in + if x1 < 160 || x2 < 160 then malformed_code (); + let uni = map2 x1 x2 in + Array.unsafe_set slice_char !n uni; + (* ok *) + len2 + 1 + | '\143' -> + (* Code set 3 *) + if len3 = 0 then malformed_code (); + if !p + len3 >= p_max then 0 + else + let x1 = Char.code (ops.get s_in (!p + 1)) in + let x2 = + if len3 = 1 then 256 else Char.code (ops.get s_in (!p + 2)) + in + if x1 < 160 || x2 < 160 then malformed_code (); + let uni = map3 x1 x2 in + Array.unsafe_set slice_char !n uni; + (* ok *) + len3 + 1 + | '\160' .. '\255' as x1_code -> + (* Code set 1 *) + if !p + len1 > p_max then 0 + else + let x1 = Char.code x1_code in + let x2 = + if len1 = 1 then 256 else Char.code (ops.get s_in (!p + 1)) + in + if x2 < 160 then malformed_code (); + let uni = map1 x1 x2 in + Array.unsafe_set slice_char !n uni; + (* ok *) + len1 + | _ -> + (* illegal *) + malformed_code () + in + + (* If k_inc = 0, the character was partially outside the processed + * range of the string, and could not be decoded. + *) + if k_inc > 0 then ( + (* We know: + * (1) n >= 0, because n starts with 0 and is only increased + * (2) n < Array.length slice_char = Array.length slice_blen + * ==> unsafe set ok + *) + (* slice_blen.(n) <- k_inc; *) + Array.unsafe_set slice_blen !n k_inc; + (* next iteration: *) + p := !p + k_inc; + incr n) + else ( + (* Stop loop: *) + n_ret := !n; + n := slice_length) + done; + + if !n_ret = -1 then n_ret := !n; + if !n_ret < slice_length then (* EOF marker *) + slice_char.(!n_ret) <- -1; + (!n_ret, !p - p_in, enc) + in + { read } + +let read_eucjp () = + let jis0201 = Netmappings.get_to_unicode "jis0201" in + let jis0208 = Netmappings.get_to_unicode "jis0208" in + let jis0212 = lazy (Netmappings.get_to_unicode "jis0212") in + (* seldom *) + let map1 x1 x2 = jis0208.(((x1 - 160) * 96) + x2 - 160) in + let map2 x1 _ = jis0201.(x1) in + let map3 x1 x2 = (Lazy.force jis0212).(((x1 - 160) * 96) + x2 - 160) in + read_euc 2 1 2 map1 map2 map3 `Enc_eucjp + +let read_euckr () = + let ks1001 = Netmappings.get_to_unicode "ks1001" in + let map x1 x2 = ks1001.(((x1 - 160) * 96) + x2 - 160) in + read_euc 2 0 0 map map map `Enc_euckr + +let read_subset inner_read def = + let read ops slice_char slice_blen s_in p_in l_in = + let open Netstring_tstring in + assert (Array.length slice_char = Array.length slice_blen); + assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); + + let n, k, enc' = inner_read.read ops slice_char slice_blen s_in p_in l_in in + + (* check codepoints: *) + for j = 0 to n - 1 do + if not (def slice_char.(j)) then ( + (* raise Malformed_code_read... *) + (* to get enc'' read again: *) + let slice_char' = Array.make j (-1) in + let slice_blen' = Array.make j 1 in + let n', k', enc'' = + try inner_read.read ops slice_char' slice_blen' s_in p_in l_in + with Malformed_code_read (_, _, _) -> assert false + in + assert (n' = j); + int_blit slice_char' 0 slice_char 0 j; + int_blit slice_blen' 0 slice_blen 0 j; + slice_char.(j) <- -1; + raise (Malformed_code_read (j, k', enc''))) + done; + + (n, k, enc') + in + { read } + +(* + * let (n_char, b_byte) = + * write_XXX slice_char slice_length s_out p_out l_out subst + *) + +let write_iso88591 maxcode slice_char slice_pos slice_length s_out p_out l_out + subst = + (* UNSAFE_OPT *) + (* Use maxcode=255 for ISO-8859-1, and maxcode=127 for US-ASCII, + * and maxcode=(-1) for `Enc_empty. + *) + assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); + assert (slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); + assert (maxcode <= 255); + + let n = ref slice_pos in + (* index of slice *) + let n_ret = ref (-1) in + (* returned number of characters *) + let n_max = slice_pos + slice_length in + + let p = ref p_out in + (* current output position *) + let p_max = p_out + l_out in + + (* maximum output position *) + while !n < n_max && !p < p_max do + (* We know: + * (1) !n >= 0, because it starts with 0 and is only increased + * (2) !n < n_max = slice_pos + slice_length <= Array.length slice_char + * ==> unsafe get ok + *) + let ch = Array.unsafe_get slice_char !n in + if ch >= 0 && ch <= maxcode then ( + (* Because !p < p_max: + * !p < p_max = p_out + l_out <= String.length s_out + * Furthermore, p_out >= 0, !p >= 0. + * ==> unsafe set ok + *) + (* s_out.[ !p ] <- Char.chr ch; *) + Bytes.unsafe_set s_out !p (Char.unsafe_chr ch); + incr n; + incr p) + else ( + assert (ch >= 0); + let replacement = subst ch in + let l_repl = String.length replacement in + if l_repl > multibyte_limit then + failwith "Netconversion.write_iso88591: Substitution string too long"; + if !p + l_repl <= p_max then ( + (* Enough space to store 'replacement': *) + Bytes.blit_string replacement 0 s_out !p l_repl; + p := !p + l_repl; + incr n) + else ( + (* Exit whole conversion *) + n_ret := !n; + n := n_max)) + done; + if !n_ret >= 0 then (!n_ret - slice_pos, !p - p_out) + else (!n - slice_pos, !p - p_out) + +let get_8bit_from_unicode_map enc = + let cs = + match required_charsets enc with + | [ cs ] -> cs + | _ -> failwith "get_8bit_from_unicode_map" + in + let from_unicode = Netmappings.get_from_unicode (internal_name cs) in + assert (Array.length from_unicode = 256); + from_unicode + +let write_8bit enc = + (* UNSAFE_OPT *) + let m_from_unicode = get_8bit_from_unicode_map enc in + let m_mask = Array.length m_from_unicode - 1 in + + fun slice_char slice_pos slice_length s_out p_out l_out subst -> + assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); + assert ( + slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); + + let n = ref slice_pos in + (* index of slice *) + let n_max = slice_pos + slice_length in + + let k = ref 0 in + (* written bytes *) + let n_ret = ref (-1) in + + (* returned number of characters *) + while !n < n_max && !k < l_out do + (* We know: + * (1) !n >= 0, because it starts with 0 and is only increased + * (2) !n < n_max = slice_pos + slice_length <= Array.length slice + * ==> unsafe get ok + *) + let p = + (* slice_char.( !n ) *) + Array.unsafe_get slice_char !n + in + let p' = + match Array.unsafe_get m_from_unicode (p land m_mask) with + | Netmappings.U_nil -> -1 + | Netmappings.U_single (p0, q0) -> if p0 = p then q0 else -1 + | Netmappings.U_double (p0, q0, p1, q1) -> + if p0 = p then q0 else if p1 = p then q1 else -1 + | Netmappings.U_array pq -> + let r = ref (-1) in + let h = ref 0 in + while !r < 0 && !h < Array.length pq do + if pq.(!h) = p then r := pq.(!h + 1) else h := !h + 2 + done; + !r + in + + (* If p=-1 ==> p'=-1, because -1 is never mapped to any code point *) + if p' < 0 then ( + if p < 0 then assert false (* EOF mark found *) + else + let replacement = subst p in + let l_repl = String.length replacement in + if l_repl > multibyte_limit then + failwith "Netconversion.write_8bit: Substitution string too long"; + + if !k + l_repl <= l_out then ( + (* Enough space to store 'replacement': *) + Bytes.blit_string replacement 0 s_out (p_out + !k) l_repl; + k := !k + l_repl; + incr n) + else ( + (* Exit whole conversion *) + n_ret := !n; + n := n_max)) + else ( + (* Because !k < l_out: + * p_out + !k < p_out + l_out <= String.length s_out + * Furthermore, p_out >= 0, !k >= 0. + * ==> unsafe set ok + *) + (* s_out.[ p_out + !k ] <- Char.chr p'; *) + Bytes.unsafe_set s_out (p_out + !k) (Char.unsafe_chr (p' land 0xff)); + incr n; + incr k) + done; + if !n_ret >= 0 then (!n_ret - slice_pos, !k) else (!n - slice_pos, !k) + +let write_utf8 is_java slice_char slice_pos slice_length s_out p_out l_out subst + = + (* UNSAFE_OPT *) + assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); + assert (slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); + + let n = ref slice_pos in + (* index of slice *) + let n_max = slice_pos + slice_length in + + let k = ref 0 in + (* written bytes *) + let n_ret = ref (-1) in + + (* returned number of characters *) + while !n < n_max do + (* We know: + * (1) !n >= 0, because it starts with 0 and is only increased + * (2) !n < n_max = slice_pos + slice_length <= Array.length slice + * ==> unsafe get ok + *) + let p = + (* slice.( !n ) *) + Array.unsafe_get slice_char !n + in + + let index = p_out + !k in + + let k_inc = + (* k_inc: how many bytes are written. (-1) means: stop *) + if p <= 127 && ((not is_java) || p <> 0) then ( + if p < 0 then assert false; + (* EOF mark *) + if !k < l_out then ( + (* (1) index = p_out + !k < p_out + l_out <= + * String.length s_out + * (2) p_out, !n >= 0 + * ==> unsafe set ok + * + * 0 <= p <= 127 ==> unsafe_chr ok + *) + (* s_out.[index] <- Char.chr p; *) + Bytes.unsafe_set s_out index (Char.unsafe_chr p); + 1) + else -1) + else if p <= 0x7ff then + if !k + 1 < l_out then ( + (* (1) index+1 = p_out + !k + 1 < p_out + l_out <= + * String.length s_out + * (2) p_out, !k >= 0 + * ==> unsafe set ok + * + * p <= 0x7ff ==> p lsr 6 <= 0x1f + * ==> 0xc0 lor (p lsr 6) <= df + * p land 0x3f <= 0x3f ==> 0x80 lor (p land 0x3f) <= 0xbf + * ==> unsafe_chr ok + *) + (* s_out.[index] <- Char.chr (0xc0 lor (p lsr 6)); *) + (* s_out.[index + 1] <- Char.chr (0x80 lor (p land 0x3f)); *) + Bytes.unsafe_set s_out index (Char.unsafe_chr (0xc0 lor (p lsr 6))); + Bytes.unsafe_set s_out (index + 1) + (Char.unsafe_chr (0x80 lor (p land 0x3f))); + 2) + else -1 + else if p <= 0xffff then ( + (* Refuse writing surrogate pairs, and fffe, ffff *) + if (p >= 0xd800 && p < 0xe000) || p >= 0xfffe then + failwith "Netconversion.write_utf8"; + if !k + 2 < l_out then ( + (* (1) index+2 = p_out + !k + 2 < p_out + l_out <= + * String.length s_out + * (2) p_out, !k >= 0 + * ==> unsafe set ok + * + * Well, and it can be proven that unsafe_chr is ok, too... + *) + (* s_out.[index] <- Char.chr (0xe0 lor (p lsr 12)); *) + (* s_out.[index + 1] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f)); *) + (* s_out.[index + 2] <- Char.chr (0x80 lor (p land 0x3f)); *) + Bytes.unsafe_set s_out index (Char.unsafe_chr (0xe0 lor (p lsr 12))); + Bytes.unsafe_set s_out (index + 1) + (Char.unsafe_chr (0x80 lor ((p lsr 6) land 0x3f))); + Bytes.unsafe_set s_out (index + 2) + (Char.unsafe_chr (0x80 lor (p land 0x3f))); + 3) + else -1) + else if p <= 0x10ffff then + if !k + 3 < l_out then ( + (* No such characters are defined... *) + Bytes.set s_out index (Char.chr (0xf0 lor (p lsr 18))); + Bytes.set s_out (index + 1) + (Char.chr (0x80 lor ((p lsr 12) land 0x3f))); + Bytes.set s_out (index + 2) + (Char.chr (0x80 lor ((p lsr 6) land 0x3f))); + Bytes.set s_out (index + 3) (Char.chr (0x80 lor (p land 0x3f))); + 4) + else -1 + else + (* Higher code points are not possible in XML; call subst *) + let replacement = subst p in + let l_repl = String.length replacement in + if l_repl > multibyte_limit then + failwith "Netconversion.write_utf8: Substitution string too long"; + if !k + l_repl <= l_out then ( + (* Enough space to store 'replacement': *) + Bytes.blit_string replacement 0 s_out (p_out + !k) l_repl; + l_repl (* may be 0! *)) + else -1 (* Exit whole conversion *) + in + + if k_inc >= 0 then ( + k := !k + k_inc; + incr n) + else ( + n_ret := !n; + n := n_max) + done; + if !n_ret >= 0 then (!n_ret - slice_pos, !k) else (!n - slice_pos, !k) + +let write_utf16_lebe lo hi slice_char slice_pos slice_length s_out p_out l_out + subst = + (* lo=0, hi=1: little endian + * lo=1, hi=0: big endian + *) + assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); + assert (slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); + + let n = ref slice_pos in + (* index of slice *) + let n_max = slice_pos + slice_length in + + let k = ref 0 in + (* written bytes *) + let n_ret = ref (-1) in + + (* returned number of characters *) + while !n < n_max do + let p = slice_char.(!n) in + + let index = p_out + !k in + + let k_inc = + if p >= 0xfffe then ( + if p <= 0x10ffff then ( + if p <= 0xffff then failwith "Netconversion.write_utf16_le"; + (* Must be written as surrogate pair *) + if !k + 3 < l_out then ( + let high = ((p - 0x10000) lsr 10) + 0xd800 in + let low = (p land 0x3ff) + 0xdc00 in + Bytes.set s_out (index + lo) (Char.chr (high land 0xff)); + Bytes.set s_out (index + hi) (Char.chr (high lsr 8)); + Bytes.set s_out (index + 2 + lo) (Char.chr (low land 0xff)); + Bytes.set s_out (index + 2 + hi) (Char.chr (low lsr 8)); + 4) + else -1) + else + (* Higher code points are not possible in XML; call subst *) + let replacement = subst p in + let l_repl = String.length replacement in + if l_repl > multibyte_limit then + failwith + "Netconversion.write_utf16_le: Substitution string too long"; + if !k + l_repl <= l_out then ( + (* Enough space to store 'replacement': *) + Bytes.blit_string replacement 0 s_out (p_out + !k) l_repl; + l_repl (* may be 0! *)) + else -1 (* Exit whole conversion *)) + else if (* 2-byte character *) + !k + 1 < l_out then ( + Bytes.set s_out (index + lo) (Char.unsafe_chr (p land 0xff)); + Bytes.set s_out (index + hi) (Char.unsafe_chr ((p lsr 8) land 0xff)); + 2) + else -1 + in + if k_inc >= 0 then ( + k := !k + k_inc; + incr n) + else ( + n_ret := !n; + n := n_max) + done; + if !n_ret >= 0 then (!n_ret - slice_pos, !k) else (!n - slice_pos, !k) + +let write_utf32_lebe little slice_char slice_pos slice_length s_out p_out l_out + subst = + assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); + assert (slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); + + let n = ref slice_pos in + (* index of slice *) + let n_max = slice_pos + slice_length in + + let k = ref 0 in + (* written bytes *) + let n_ret = ref (-1) in + + (* returned number of characters *) + let b0 = if little then 0 else 3 in + let b1 = if little then 1 else 2 in + let b2 = if little then 2 else 1 in + let b3 = if little then 3 else 0 in + + while !n < n_max do + let p = slice_char.(!n) in + + let index = p_out + !k in + + let k_inc = + if p <= 0x10ffff then + if !k + 3 < l_out then ( + Bytes.set s_out (index + b0) (Char.unsafe_chr (p land 0xff)); + Bytes.set s_out (index + b1) (Char.unsafe_chr ((p lsr 8) land 0xff)); + Bytes.set s_out (index + b2) (Char.unsafe_chr ((p lsr 16) land 0xff)); + Bytes.set s_out (index + b3) (Char.unsafe_chr 0); + 4) + else -1 + else + (* Higher code points are not possible in XML; call subst *) + let replacement = subst p in + let l_repl = String.length replacement in + if l_repl > multibyte_limit then + failwith "Netconversion.write_utf32: Substitution string too long"; + if !k + l_repl <= l_out then ( + (* Enough space to store 'replacement': *) + Bytes.blit_string replacement 0 s_out (p_out + !k) l_repl; + l_repl (* may be 0! *)) + else -1 + (* Exit whole conversion *) + in + if k_inc >= 0 then ( + k := !k + k_inc; + incr n) + else ( + n_ret := !n; + n := n_max) + done; + if !n_ret >= 0 then (!n_ret - slice_pos, !k) else (!n - slice_pos, !k) + +let write_euc map _enc + (* Code set 0 is US-ASCII. + * let (set, byte1, byte2) = map unicode: + * - set is 1, 2, 3, or 4. 4 means that the code point cannot be mapped. + * - byte1 >= 160, <= 255 + * - byte2 >= 160, <= 255, or byte2=256 meaning that it is not used + *) + (* UNSAFE_OPT *) + slice_char slice_pos slice_length s_out p_out l_out subst = + assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); + assert (slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); + + let n = ref slice_pos in + (* index of slice *) + let n_max = slice_pos + slice_length in + + let k = ref 0 in + (* written bytes *) + let n_ret = ref (-1) in + + (* returned number of characters *) + while !n < n_max do + (* We know: + * (1) !n >= 0, because it starts with 0 and is only increased + * (2) !n < n_max = slice_pos + slice_length <= Array.length slice + * ==> unsafe get ok + *) + let p = + (* slice.( !n ) *) + Array.unsafe_get slice_char !n + in + assert (p >= 0); + + let index = p_out + !k in + + let set, b1, b2 = if p <= 127 then (0, p, 256) else map p in + + let k_inc = + (* k_inc: how many bytes are written *) + match set with + | 0 -> + if !k < l_out then ( + (* s_out.[index] <- Char.chr p; *) + Bytes.unsafe_set s_out index (Char.unsafe_chr (b1 land 127)); + 1) + else -1 + | 1 -> + let bl = if b2 = 256 then 1 else 2 in + if !k + bl < l_out then ( + assert (b1 >= 160 && b1 <= 255 && b2 >= 160 && b2 <= 256); + Bytes.set s_out index (Char.chr b1); + if b2 <> 256 then Bytes.set s_out (index + 1) (Char.chr b2); + bl) + else -1 + | 2 -> + let bl = if b2 = 256 then 2 else 3 in + if !k + bl < l_out then ( + assert (b1 >= 160 && b1 <= 255 && b2 >= 160 && b2 <= 256); + Bytes.set s_out index '\142'; + Bytes.set s_out (index + 1) (Char.chr b1); + if b2 <> 256 then Bytes.set s_out (index + 2) (Char.chr b2); + bl) + else -1 + | 3 -> + let bl = if b2 = 256 then 2 else 3 in + if !k + bl < l_out then ( + assert (b1 >= 160 && b1 <= 255 && b2 >= 160 && b2 <= 256); + Bytes.set s_out index '\143'; + Bytes.set s_out (index + 1) (Char.chr b1); + if b2 <> 256 then Bytes.set s_out (index + 2) (Char.chr b2); + bl) + else -1 + | 4 -> + let replacement = subst p in + let l_repl = String.length replacement in + if l_repl > multibyte_limit then + failwith "Netconversion.write_euc: Substitution string too long"; + if !k + l_repl <= l_out then ( + (* Enough space to store 'replacement': *) + Bytes.blit_string replacement 0 s_out (p_out + !k) l_repl; + l_repl) + else -1 + (* Exit whole conversion *) + | _ -> assert false + in + if k_inc >= 0 then ( + k := !k + k_inc; + incr n) + else ( + n_ret := !n; + n := n_max) + done; + if !n_ret >= 0 then (!n_ret - slice_pos, !k) else (!n - slice_pos, !k) + +let write_eucjp () = + let jis0201 = Netmappings.get_from_unicode "jis0201" in + let jis0208 = Netmappings.get_from_unicode "jis0208" in + let jis0212 = Netmappings.get_from_unicode "jis0212" in + + let jis0201_mask = Array.length jis0201 - 1 in + let jis0208_mask = Array.length jis0208 - 1 in + let jis0212_mask = Array.length jis0212 - 1 in + + let map p = + (* Try in order: jis0208, jis0201, jis0212 *) + let map_tbl jistbl jistbl_mask = + match jistbl.(p land jistbl_mask) with + | Netmappings.U_nil -> -1 + | Netmappings.U_single (p0, q0) -> if p0 = p then q0 else -1 + | Netmappings.U_double (p0, q0, p1, q1) -> + if p0 = p then q0 else if p1 = p then q1 else -1 + | Netmappings.U_array pq -> + let r = ref (-1) in + let h = ref 0 in + while !r < 0 && !h < Array.length pq do + if pq.(!h) = p then r := pq.(!h + 1) else h := !h + 2 + done; + !r + in + let cp_0208 = map_tbl jis0208 jis0208_mask in + if cp_0208 >= 0 then + let row = cp_0208 / 96 in + let col = cp_0208 - (row * 96) in + (1, row + 160, col + 160) + else + let cp_0201 = map_tbl jis0201 jis0201_mask in + if cp_0201 >= 128 then (* Ignore especially 0x5c, 0x7e *) + (2, cp_0201, 256) + else + let cp_0212 = map_tbl jis0212 jis0212_mask in + if cp_0212 >= 0 then + let row = cp_0212 / 96 in + let col = cp_0212 - (row * 96) in + (3, row + 160, col + 160) + else (4, 256, 256) + in + write_euc map `Enc_eucjp + +let write_euckr () = + let ks1001 = Netmappings.get_from_unicode "ks1001" in + + let ks1001_mask = Array.length ks1001 - 1 in + + let map p = + let map_tbl kstbl kstbl_mask = + match kstbl.(p land kstbl_mask) with + | Netmappings.U_nil -> -1 + | Netmappings.U_single (p0, q0) -> if p0 = p then q0 else -1 + | Netmappings.U_double (p0, q0, p1, q1) -> + if p0 = p then q0 else if p1 = p then q1 else -1 + | Netmappings.U_array pq -> + let r = ref (-1) in + let h = ref 0 in + while !r < 0 && !h < Array.length pq do + if pq.(!h) = p then r := pq.(!h + 1) else h := !h + 2 + done; + !r + in + let cp_1001 = map_tbl ks1001 ks1001_mask in + if cp_1001 >= 0 then + let row = cp_1001 / 96 in + let col = cp_1001 - (row * 96) in + (1, row + 160, col + 160) + else (4, 256, 256) + in + write_euc map `Enc_euckr + +let special_cpoint = 0x110000 + +let write_subset inner_writer def slice_char slice_pos slice_length s_out p_out + l_out subst = + assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); + assert (slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); + + (* Force that the subst' function is called for all undefined code + * points + *) + let slice_char' = Array.sub slice_char slice_pos slice_length in + for n = 0 to slice_length - 1 do + let ch = slice_char'.(n) in + if ch >= special_cpoint || not (def ch) then + slice_char'.(n) <- special_cpoint + n + done; + + let subst' ch = + if ch >= special_cpoint then + subst slice_char.(slice_pos + ch - special_cpoint) + else subst ch + in + + inner_writer slice_char' 0 slice_length s_out p_out l_out subst' + +type encoding1 = [ encoding | `Enc_utf16_bom | `Enc_utf32_bom | `Enc_utf8_bom ] + +(* `Enc_*_bom considers the BOM as a character with code point -3. + * This encoding is only internally used. + *) + +let rec get_reader1 (enc : encoding1) = + (* get_reader1 supports the additional internal encodings of + * encoding1. get_reader (below) only supports the exported + * encodings. + *) + match enc with + | `Enc_iso88591 -> !read_iso88591_ref 255 `Enc_iso88591 + | `Enc_usascii -> !read_iso88591_ref 127 `Enc_usascii + | `Enc_empty -> !read_iso88591_ref (-1) `Enc_empty + | `Enc_utf8 -> !read_utf8_ref false + | `Enc_java -> !read_utf8_ref true + | `Enc_utf8_opt_bom -> read_utf8_opt_bom false + | `Enc_utf8_bom -> read_utf8_opt_bom true + | `Enc_utf16 -> read_utf16 false + | `Enc_utf16_bom -> read_utf16 true + | `Enc_utf16_le -> read_utf16_lebe 0 1 0 `Enc_utf16_le + | `Enc_utf16_be -> read_utf16_lebe 1 0 0 `Enc_utf16_be + | `Enc_utf32 -> read_utf32 false + | `Enc_utf32_bom -> read_utf32 true + | `Enc_utf32_le -> read_utf32_lebe true 0 `Enc_utf32_le + | `Enc_utf32_be -> read_utf32_lebe false 0 `Enc_utf32_be + | `Enc_eucjp -> read_eucjp () + | `Enc_euckr -> read_euckr () + | `Enc_subset (e, def) -> + let reader' = get_reader1 (e :> encoding1) in + read_subset reader' def + | #encoding as e -> read_8bit (e :> encoding) + +let get_reader = (get_reader1 : encoding1 -> 'a :> encoding -> 'a) + +let rec get_writer enc = + match enc with + | `Enc_iso88591 -> write_iso88591 255 + | `Enc_usascii -> write_iso88591 127 + | `Enc_empty -> write_iso88591 (-1) + | `Enc_utf8 -> write_utf8 false + | `Enc_java -> write_utf8 true + | `Enc_utf16 -> + failwith + "Netconversion: Cannot output text as `Enc_utf16, use `Enc_utf16_le or \ + `Enc_utf16_be" + | `Enc_utf16_le -> write_utf16_lebe 0 1 + | `Enc_utf16_be -> write_utf16_lebe 1 0 + | `Enc_utf32 -> + failwith + "Netconversion: Cannot output text as `Enc_utf32, use `Enc_utf32_le or \ + `Enc_utf32_be" + | `Enc_utf32_le -> write_utf32_lebe true + | `Enc_utf32_be -> write_utf32_lebe false + | `Enc_eucjp -> write_eucjp () + | `Enc_euckr -> write_euckr () + | `Enc_subset (e, def) -> + let writer' = get_writer e in + write_subset writer' def + | _ -> write_8bit enc + +let recode_poly ~in_ops ~in_enc ~in_buf ~in_pos ~in_len ~out_enc ~out_buf + ~out_pos ~out_len ~max_chars ~subst = + let open Netstring_tstring in + if + in_pos < 0 || in_len < 0 + || in_pos + in_len > in_ops.length in_buf + || out_pos < 0 || out_len < 0 + || out_pos + out_len > Bytes.length out_buf + then invalid_arg "Netconversion.recode"; + + (* An array with 250 elements can be allocated in the minor heap. *) + let slice_length = big_slice in + let slice_char = Array.make slice_length (-1) in + let slice_blen = Array.make slice_length 1 in + + let in_k = ref 0 in + (* read bytes *) + let in_n = ref 0 in + (* read characters *) + let in_eof = ref (!in_k >= in_len) in + let out_k = ref 0 in + (* written bytes *) + let out_n = ref 0 in + (* written characters *) + let out_eof = ref (!out_k >= out_len || !out_n >= max_chars) in + + let rd_enc = ref in_enc in + let reader = ref (get_reader in_enc) in + let writer = get_writer out_enc in + + while (not !in_eof) && not !out_eof do + let in_n_inc, in_k_inc, rd_enc' = + try + !reader.read in_ops slice_char slice_blen in_buf (in_pos + !in_k) + (in_len - !in_k) + with Malformed_code_read (in_n_inc, in_k_inc, rd_enc') -> + if in_n_inc = 0 then raise Malformed_code; + (in_n_inc, in_k_inc, rd_enc') + in + + let out_n_inc_max = min in_n_inc (max_chars - !out_n) in + (* do not write more than max_chars *) + let out_n_inc, out_k_inc = + if out_n_inc_max > 0 then + writer slice_char 0 out_n_inc_max out_buf (out_pos + !out_k) + (out_len - !out_k) subst + else (0, 0) + in + + let in_n_inc', in_k_inc' = + if in_n_inc > out_n_inc then ( + (* Not all read characters could be written *) + let sum = ref 0 in + for j = 0 to out_n_inc - 1 do + sum := !sum + slice_blen.(j) + done; + (out_n_inc, !sum)) + else (in_n_inc, in_k_inc) + in + + in_k := !in_k + in_k_inc'; + in_n := !in_n + in_n_inc'; + out_k := !out_k + out_k_inc; + out_n := !out_n + out_n_inc; + + (* Detect change of input encoding: *) + if rd_enc' <> !rd_enc then ( + rd_enc := rd_enc'; + reader := get_reader rd_enc'; + Array.fill slice_blen 0 slice_length 1); + + (* EOF criteria: + * - It is possible that !in_k never reaches in_len because there is a + * multibyte character at the end that is partially outside the input + * range + * - For the same reason it is possible that !out_k never reaches out_len + * - It is accepted as reader EOF if not even one character can be + * scanned + * - It is accepted as writer EOF if fewer than in_n_inc characters + * could be written + *) + in_eof := !in_k >= in_len || in_n_inc = 0; + + out_eof := !out_k >= out_len || !out_n >= max_chars || out_n_inc < in_n_inc + done; + + (!in_k, !out_k, !rd_enc) + +let rec ustring_of_uchar enc = + let multi_byte writer n p = + let s = Bytes.create n in + let _, n_act = + writer [| p |] 0 1 s 0 n (fun _ -> raise (Cannot_represent p)) + in + Bytes.sub_string s 0 n_act + in + match enc with + | `Enc_iso88591 -> + fun p -> + if p > 255 then raise (Cannot_represent p); + String.make 1 (Char.chr p) + | `Enc_usascii -> + fun p -> + if p > 127 then raise (Cannot_represent p); + String.make 1 (Char.chr p) + | `Enc_utf8 | `Enc_utf8_opt_bom -> multi_byte (write_utf8 false) 4 + | `Enc_java -> multi_byte (write_utf8 true) 4 + | `Enc_utf16_le -> multi_byte (write_utf16_lebe 0 1) 4 + | `Enc_utf16_be -> multi_byte (write_utf16_lebe 1 0) 4 + | `Enc_utf16 -> + invalid_arg "Netconversion.ustring_of_uchar: UTF-16 not possible" + | `Enc_utf32_le -> multi_byte (write_utf32_lebe true) 4 + | `Enc_utf32_be -> multi_byte (write_utf32_lebe false) 4 + | `Enc_utf32 -> + invalid_arg "Netconversion.ustring_of_uchar: UTF-32 not possible" + | `Enc_eucjp -> multi_byte (write_eucjp ()) 3 + | `Enc_euckr -> multi_byte (write_euckr ()) 2 + | `Enc_subset (e, def) -> + fun p -> + if def p then ustring_of_uchar e p else raise (Cannot_represent p) + | _ -> + let writer = write_8bit enc in + multi_byte writer 1 + +let makechar enc = + let us = ustring_of_uchar enc in + fun p -> try us p with Cannot_represent _ -> raise Not_found + +(* The following algorithms assume that there is an upper limit of the length + * of a multibyte character. Currently, UTF8 is the encoding with the longest + * multibyte characters (6 bytes). + * Because of this limit, it is allowed to allocate a buffer that is "large + * enough" in order to ensure that at least one character is recoded in every + * loop cycle. If the buffer was not large enough, no character would be + * processed in a cycle, and the algorithm would hang. + *) + +let convert_poly : + type s t. + in_ops:s Netstring_tstring.tstring_ops -> + out_kind:t Netstring_tstring.tstring_kind -> + ?subst:(int -> string) -> + in_enc:encoding -> + out_enc:encoding -> + ?range_pos:int -> + ?range_len:int -> + s -> + t = + fun ~in_ops ~out_kind ?(subst = fun p -> raise (Cannot_represent p)) ~in_enc + ~out_enc ?(range_pos = 0) ?range_len s -> + let open Netstring_tstring in + let range_len = + match range_len with Some l -> l | None -> in_ops.length s - range_pos + in + + if range_pos < 0 || range_len < 0 || range_pos + range_len > in_ops.length s + then invalid_arg "Netconversion.convert"; + + (* Estimate the size of the output string: + * length * 2 is just guessed. It is assumed that this number is usually + * too large, and to avoid that too much memory is wasted, the buffer is + * limited by 10000. + *) + let size = ref (max multibyte_limit (min 10000 (range_len * 2))) in + let out_buf = ref (Bytes.create !size) in + + let k_in = ref 0 in + let k_out = ref 0 in + + while !k_in < range_len do + let in_len = range_len - !k_in in + let out_len = !size - !k_out in + assert (out_len >= multibyte_limit); + (* space for at least one char *) + let k_in_inc, k_out_inc, _in_enc' = + recode_poly ~in_ops ~in_enc ~in_buf:s ~in_pos:(range_pos + !k_in) ~in_len + ~out_enc ~out_buf:!out_buf ~out_pos:!k_out ~out_len ~max_chars:max_int + ~subst + in + if k_in_inc = 0 then raise Malformed_code; + (* Reasons for k_in_inc = 0: + * (1) There is not enough space in out_buf to add a single character + * (2) in_buf ends with a prefix of a multi-byte character + * Because there is always space for at least one character + * ( = multibyte_limit ), reason (1) can be excluded. So it must + * be (2), and we can raise Malformed_code. + *) + k_in := !k_in + k_in_inc; + k_out := !k_out + k_out_inc; + (* double the size of out_buf: *) + let size' = min Sys.max_string_length (!size + !size) in + if size' < !size + multibyte_limit then + failwith "Netconversion.convert: string too long"; + let out_buf' = Bytes.create size' in + Bytes.blit !out_buf 0 out_buf' 0 !k_out; + out_buf := out_buf'; + size := size' + done; + match out_kind with + | Netstring_tstring.String_kind -> Bytes.sub_string !out_buf 0 !k_out + | Netstring_tstring.Bytes_kind -> Bytes.sub !out_buf 0 !k_out + +let convert ?subst ~in_enc ~out_enc ?range_pos ?range_len s = + convert_poly ?subst ~in_ops:Netstring_tstring.string_ops + ~out_kind:Netstring_tstring.String_kind ~in_enc ~out_enc ?range_pos + ?range_len s + +let uarray_of_ustring_poly ops enc ?(range_pos = 0) ?range_len s = + let open Netstring_tstring in + let range_len = + match range_len with Some l -> l | None -> ops.length s - range_pos + in + + if range_pos < 0 || range_len < 0 || range_pos + range_len > ops.length s then + invalid_arg "Netconversion.uarray_of_ustring"; + + let slice_length = big_slice in + let slice_char = Array.make slice_length (-1) in + let slice_blen = Array.make slice_length 1 in + + let k = ref 0 in + let e = ref enc in + let reader = ref (get_reader enc) in + let buf = ref [] in + + while !k < range_len do + let n_inc, k_inc, enc' = + try + !reader.read ops slice_char slice_blen s (range_pos + !k) + (range_len - !k) + with Malformed_code_read (_, _, _) -> raise Malformed_code + in + + k := !k + k_inc; + buf := Array.sub slice_char 0 n_inc :: !buf; + + if enc' <> !e then ( + e := enc'; + reader := get_reader enc'; + Array.fill slice_blen 0 slice_length 1); + + if n_inc < slice_length then ( + (* EOF *) + if !k < range_len then raise Malformed_code; + (* s ends with multi-byte prefix*) + k := range_len) + done; + + Array.concat (List.rev !buf) + +let uarray_of_ustring enc = + uarray_of_ustring_poly Netstring_tstring.string_ops enc + +let ustring_of_uarray_poly out_kind + ?(subst = fun code -> raise (Cannot_represent code)) enc ?(pos = 0) ?len ua + = + let len = match len with Some l -> l | None -> Array.length ua - pos in + + if pos < 0 || len < 0 || pos + len > Array.length ua then + invalid_arg "Netconversion.ustring_of_uarray"; + + (* Estimate the size of the output string: + * length * 2 is just guessed. It is assumed that this number is usually + * too large, and to avoid that too much memory is wasted, the buffer is + * limited by 10000. + *) + let size = ref (max multibyte_limit (min 10000 (len * 2))) in + let out_buf = ref (Bytes.create !size) in + + let writer = get_writer enc in + + let k_in = ref 0 in + let k_out = ref 0 in + + while !k_in < len do + let k_in_inc, k_out_inc = + writer ua (pos + !k_in) (len - !k_in) !out_buf !k_out (!size - !k_out) + subst + in + k_in := !k_in + k_in_inc; + k_out := !k_out + k_out_inc; + + (* double the size of out_buf: *) + let size' = min Sys.max_string_length (!size + !size) in + if size' < !size + multibyte_limit then + failwith "Netconversion.ustring_of_uarray: string too long"; + let out_buf' = Bytes.create size' in + Bytes.blit !out_buf 0 out_buf' 0 !k_out; + out_buf := out_buf'; + size := size' + done; + + Netstring_tstring.bytes_subpoly out_kind !out_buf 0 !k_out + +let ustring_of_uarray ?subst = + ustring_of_uarray_poly Netstring_tstring.String_kind ?subst diff --git a/ocamlnet_lite/netconversion.mli b/ocamlnet_lite/netconversion.mli new file mode 100644 index 0000000..3a9d921 --- /dev/null +++ b/ocamlnet_lite/netconversion.mli @@ -0,0 +1,504 @@ +(** Conversion between character encodings + * + * {b Contents} + * {ul + * {- {!Netconversion.preliminaries} + * {ul + * {- {!Netconversion.unicode}} + * {- {!Netconversion.subsets}} + * {- {!Netconversion.linking}} + * {- {!Netconversion.domain}} + * {- {!Netconversion.problems}}}} + * {- {!Netconversion.interface} + * {ul + * {- {!Netconversion.direct_conv}} + * {- {!Netconversion.cursors} + * {ul {- {!Netconversion.bom}}}} + * {- {!Netconversion.unicode_functions}} + * } + * } + * } + *) + +(** {1:preliminaries Preliminaries} + * + * A {b character set} is a set of characters where every character is + * identified by a {b code point}. An {b encoding} is a way of + * representing characters from a set in byte strings. For example, + * the Unicode character set has more than 96000 characters, and + * the code points have values from 0 to 0x10ffff (not all code points + * are assigned yet). The UTF-8 encoding represents the code points + * by sequences of 1 to 4 bytes. There are also encodings that + * represent code points from several sets, e.g EUC-JP covers four + * sets. + * + * Encodings are enumerated by the type [encoding], and names follow + * the convention [`Enc_*], e.g. [`Enc_utf8]. + * Character sets are enumerated by the type + * [charset], and names follow the convention [`Set_*], e.g. + * [`Set_unicode]. + * + * This module deals mainly with encodings. It is important to know + * that the same character set may have several encodings. For example, + * the Unicode character set can be encoded as UTF-8 or UTF-16. + * For the 8 bit character sets, however, there is usually only one + * encoding, e.g [`Set_iso88591] is always encoded as [`Enc_iso88591]. + * + * In a {b single-byte encoding} every code point is represented by + * one byte. This is what many programmers are accustomed at, and + * what the OCaml language specially supports: A [string] is + * a sequence of [char]s, where [char] means an 8 bit quantity + * interpreted as character. For example, the following piece of code allocates + * a [string] of four [char]s, and assigns them individually: + * + * {[ + * let s = String.create 4 in + * s.[0] <- 'G'; + * s.[1] <- 'e'; + * s.[2] <- 'r'; + * s.[3] <- 'd'; + * ]} + * + * In a {b multi-byte encoding} there are code points that are represented + * by several bytes. As we still represent such text as [string], the + * problem arises that a single [char], actually a byte, often represents + * only a fraction of a full multi-byte character. There are two solutions: + * - Give up the principle that text is represented by [string]. + * This is, for example, the approach chosen by [Camomile], another OCaml + * library dealing with Unicode. Instead, text is represented as + * [int array]. This way, the algorithms processing the text can + * remain the same. + * - Give up the principle that individual characters can be directly + * accessed in a text. This is the primary way chosen by Ocamlnet. + * This means that there is not any longer the possibility to read + * or write the [n]th character of a text. One can, however, still + * compose texts by just concatenating the strings representing + * individual characters. Furthermore, it is possible to define + * a cursor for a text that moves sequentially along the text. + * The consequence is that programmers are restricted to sequential + * algorithms. Note that the majority of text processing falls into + * this class. + * + * The corresponding piece of code for Ocamlnet's Unicode implementation + * is: + * {[ + * let b = Buffer.create 80 in + * Buffer.add b (ustring_of_uchar `Enc_utf8 71); (* 71 = code point of 'G' *) + * Buffer.add b (ustring_of_uchar `Enc_utf8 101); (* 101 = code point of 'e' *) + * Buffer.add b (ustring_of_uchar `Enc_utf8 114); (* 114 = code point of 'r' *) + * Buffer.add b (ustring_of_uchar `Enc_utf8 100); (* 100 = code point of 'd' *) + * let s = Buffer.contents b + * ]} + * + * It is important to always remember that a [char] is no longer + * a character but simply a byte. In many of the following explanations, + * we strictly distinguish between {b byte positions} or {b byte counts}, + * and {b character positions} or {b character counts}. + * + * There a number of special effects that usually only occur in + * multi-byte encodings: + * + * - Bad encodings: Not every byte sequence is legal. When scanning + * such text, the functions will raise the exception [Malformed_code] + * when they find illegal bytes. + * - Unassigned code points: It may happen that a byte sequence is + * a correct representation for a code point, but that the code point + * is unassigned in the character set. When scanning, this is also + * covered by the exception [Malformed_code]. When converting from + * one encoding to another, it is also possible that the code point + * is only unassigned in the target character set. This case is + * usually handled by a substitution function [subst], and if no such + * function is defined, by the exception [Cannot_represent]. + * - Incomplete characters: The trailing bytes of a string may be the + * correct beginning of a byte sequence for a character, but not a + * complete sequence. Of course, if that string is the end of a + * text, this is just illegal, and also a case for [Malformed_code]. + * However, when text is processed chunk by chunk, this phenomenon + * may happen legally for all chunks but the last. For this reason, + * some of the functions below handle this case specially. + * - Byte order marks: Some encodings have both big and little endian + * variants. A byte order mark at the beginning of the text declares + * which variant is actually used. This byte order mark is a + * declaration written like a character, but actually not a + * character. + * + * There is a special class of encodings known as {b ASCII-compatible}. + * They are important because there are lots of programs and protocols + * that only interpret bytes from 0 to 127, and treat the bytes from + * 128 to 255 as data. These programs can process texts as long as + * the bytes from 0 to 127 are used as in ASCII. Fortunately, many + * encodings are ASCII-compatible, including UTF-8. + * + * {2:unicode Unicode} + * + * [Netconversion] is centred around Unicode. + * The conversion from one encoding to another works by finding the + * Unicode code point of the character + * to convert, and by representing the code point in the target encoding, + * even if neither encodings have to do with Unicode. + * Of course, this approach requires that all character sets handled + * by [Netconversion] are subsets of Unicode. + * + * The supported range of Unicode code points: 0 to 0xd7ff, 0xe000 to 0xfffd, + * 0x10000 to 0x10ffff. All these code points can be represented in + * UTF-8 and UTF-16. [Netconversion] does not know which of the code + * points are assigned and which not, and because of this, it simply + * allows all code points of the mentioned ranges (but for other character + * sets, the necessary lookup tables exist). + * + * {b UTF-8:} The UTF-8 representation can have one to four bytes. Malformed + * byte sequences are always rejected, even those that want to cheat the + * reader like "0xc0 0x80" for the code point 0. There is special support + * for the Java variant of UTF-8 ([`Enc_java]). [`Enc_utf8] strings must not + * have a byte order mark (it would be interpreted as "zero-width space" + * character). However, the Unicode standard allows byte order marks + * at the very beginning of texts; use [`Enc_utf8_opt_bom] in this case. + * + * {b UTF-16:} When reading from a string encoded as [`Enc_utf16], a byte + * order mark is expected at the beginning. The detected variant + * ([`Enc_utf16_le] or [`Enc_utf16_be]) is usually returned by the parsing + * function. The byte order mark is not included into the output string. - + * Some functions of this + * module cannot cope with [`Enc_utf16] (i.e. UTF-16 without endianess + * annotation), and will fail. + * + * Once the endianess is determined, the code point 0xfeff is no longer + * interpreted as byte order mark, but as "zero-width non-breakable space". + * + * Some code points are represented by pairs of 16 bit values, these + * are the so-called "surrogate pairs". They can only occur in UTF-16. + * + * {b UTF-32:} This is very much the same as for UTF-16. There is a little + * endian version [`Enc_utf32_le] and a big endian version [`Enc_utf32_be]. + * + * {2:subsets Subsets of Unicode} + * + * The non-Unicode character sets are subsets of Unicode. Here, it may + * happen that a Unicode code point does not have a corresponding + * code point. In this case, certain rules are applied to handle + * this (see below). It is, however, ensured that every non-Unicode + * code point has a corresponding Unicode code point. (In other words, + * character sets cannot be supported for which this property does + * not hold.) + * + * It is even possible to create further subsets artificially. The + * encoding [`Enc_subset(e,def)] means to derive a new encoding from + * the existing one [e], but to only accept the code points for which + * the definition function [def] yields the value [true]. For example, + * the encoding + * {[ `Enc_subset(`Enc_usascii, + * fun i -> i <> 34 && i <> 38 && i <> 60 && i <> 62) ]} + * is ASCII without the bracket angles, the quotation mark, and the + * ampersand character, i.e. the subset of ASCII that can be included + * in HTML text without escaping. + * + * If a code point is not defined by the encoding but found in a text, + * the reader will raise the exception [Malformed_code]. When text is + * output, however, the [subst] function will be called for undefined code + * points (which raises [Cannot_represent] by default). The [subst] + * function is an optional argument of many conversion functions that + * allows it to insert a substitution text for undefined code points. + * Note, however, that the substitution text is restricted to at most + * 50 characters (because unlimited length would lead to difficult + * problems we would like to avoid). + * + * {2:linking Linking this module} + * + * Many encodings require lookup tables. The following encodings + * are built-in and always supported: + * + * - Unicode: [`Enc_utf8], [`Enc_java], [`Enc_utf16], [`Enc_utf16_le], + [`Enc_utf16_be], [`Enc_utf32], [`Enc_utf32_le], [`Enc_utf32_be] + * - Other: [`Enc_usascii], [`Enc_iso88591], [`Enc_empty] + * + * The lookup tables for the other encodings are usually loaded at + * runtime, but it is also possible to embed them in the generated + * binary executable. See {!Netunidata} for details. The functions + * [available_input_encodings] and [available_output_encodings] can + * be invoked to find out which encodings can be loaded, or are available + * otherwise. + * + * {2:domain Supported Encodings, Restrictions} + * + * I took the mappings from [www.unicode.org], and the standard names of + * the character sets from IANA. Obviously, many character sets are missing + * that can be supported; especially ISO646 character sets, and many EBCDIC + * code pages. Stateful encodings like generic ISO-2022 have been omitted + * (stateless subsets of ISO-2022 like EUC can be supported, however; + * currently we support EUC-JP and EUC-KR). + * + * Because of the copyright statement from Unicode, I cannot put the + * source tables that describe the mappings into the distribution. They + * are publicly available from [www.unicode.org]. + * + * {2:problems Known Problems} + * + * - The following charsets do not have a bijective mapping to Unicode: + * adobe_standard_encoding, adobe_symbol_encoding, + * adobe_zapf_dingbats_encoding, cp1002 (0xFEBE). The current implementation + * simply removes one of the conflicting code point pairs - this might + * not what you want. + * - Japanese encodings: + * JIS X 0208: The character 1/32 is mapped to 0xFF3C, and not + * to 0x005C. + *) + +(** {1:interface Interface} + * + * {b Naming conventions:} + * + * As it is possible to refer to substrings by either giving a byte + * offset or by counting whole characters, these naming conventions + * are helpful: + * + * - Labels called [range_pos] and [range_len] refer to byte positions of + * characters, or substrings + * - Labels called [count] refer to positions given as the number of characters + * relative to an origin + * + * Furthermore: + * + * - A [uchar] is a single Unicode code point represented as int + * - A [ustring] is a string of encoded characters + * - A [uarray] is an [array of int] representing a string + *) + +exception Malformed_code +(** Raised when an illegal byte sequence is found *) + +exception Cannot_represent of int +(** Raised when a certain Unicode code point cannot be represented in + * the selected output encoding + *) + +type encoding = + [ `Enc_utf8 (* UTF-8 *) + | `Enc_utf8_opt_bom + | `Enc_java (* The variant of UTF-8 used by Java *) + | `Enc_utf16 (* UTF-16 with unspecified endianess (restricted usage) *) + | `Enc_utf16_le (* UTF-16 little endian *) + | `Enc_utf16_be (* UTF-16 big endian *) + | `Enc_utf32 (* UTF-32 with unspecified endianess (restricted usage) *) + | `Enc_utf32_le (* UTF-32 little endian *) + | `Enc_utf32_be (* UTF-32 big endian *) + | `Enc_usascii (* US-ASCII (only 7 bit) *) + | `Enc_iso88591 (* ISO-8859-1 *) + | `Enc_iso88592 (* ISO-8859-2 *) + | `Enc_iso88593 (* ISO-8859-3 *) + | `Enc_iso88594 (* ISO-8859-4 *) + | `Enc_iso88595 (* ISO-8859-5 *) + | `Enc_iso88596 (* ISO-8859-6 *) + | `Enc_iso88597 (* ISO-8859-7 *) + | `Enc_iso88598 (* ISO-8859-8 *) + | `Enc_iso88599 (* ISO-8859-9 *) + | `Enc_iso885910 (* ISO-8859-10 *) + | `Enc_iso885911 (* ISO-8859-11 *) + | `Enc_iso885913 (* ISO-8859-13 *) + | `Enc_iso885914 (* ISO-8859-14 *) + | `Enc_iso885915 (* ISO-8859-15 *) + | `Enc_iso885916 (* ISO-8859-16 *) + | `Enc_koi8r (* KOI8-R *) + | `Enc_jis0201 (* JIS-X-0201 (Roman in lower half; Katakana upper half *) + | `Enc_eucjp (* EUC-JP (includes US-ASCII, JIS-X-0201, -0208, -0212) *) + | (* Japanese, TODO: *) + (*| `Enc_iso2022jp of jis_state = [ `Enc_usascii | `Enc_jis0201 | + `Enc_jis0208_1978 | `Enc_jis0208_1893 ] + It is very likely that ISO-2022 will be handled in a different module. + This encoding is too weird. + | `Enc_sjis + *) + `Enc_euckr + (* EUC-KR (includes US-ASCII, KS-X-1001) *) + | (* Older standards: *) + `Enc_asn1_iso646 + (* only the language-neutral subset - "IA5String" *) + | `Enc_asn1_T61 (* ITU T.61 ("Teletex") *) + | `Enc_asn1_printable (* ASN.1 Printable *) + | (* Microsoft: *) + `Enc_windows1250 (* WINDOWS-1250 *) + | `Enc_windows1251 (* WINDOWS-1251 *) + | `Enc_windows1252 (* WINDOWS-1252 *) + | `Enc_windows1253 (* WINDOWS-1253 *) + | `Enc_windows1254 (* WINDOWS-1254 *) + | `Enc_windows1255 (* WINDOWS-1255 *) + | `Enc_windows1256 (* WINDOWS-1256 *) + | `Enc_windows1257 (* WINDOWS-1257 *) + | `Enc_windows1258 (* WINDOWS-1258 *) + | (* IBM, ASCII-based: *) + `Enc_cp437 + | `Enc_cp737 + | `Enc_cp775 + | `Enc_cp850 + | `Enc_cp852 + | `Enc_cp855 + | `Enc_cp856 + | `Enc_cp857 + | `Enc_cp860 + | `Enc_cp861 + | `Enc_cp862 + | `Enc_cp863 + | `Enc_cp864 + | `Enc_cp865 + | `Enc_cp866 + | `Enc_cp869 + | `Enc_cp874 + | `Enc_cp1006 + | (* IBM, EBCDIC-based: *) + `Enc_cp037 + | `Enc_cp424 + | `Enc_cp500 + | `Enc_cp875 + | `Enc_cp1026 + | `Enc_cp1047 + | (* Adobe: *) + `Enc_adobe_standard_encoding + | `Enc_adobe_symbol_encoding + | `Enc_adobe_zapf_dingbats_encoding + | (* Apple: *) + `Enc_macroman + | (* Encoding subset: *) + `Enc_subset of encoding * (int -> bool) + | `Enc_empty (* does not encode any character *) ] +(** The polymorphic variant enumerating the supported encodings. We have: + * - [`Enc_utf8]: UTF-8 + * - [`Enc_utf8_opt_bom]: UTF-8 with an optional byte order mark at the + * beginning of the text + * - [`Enc_java]: The UTF-8 variant used by Java (the only difference is + * the representation of NUL) + * - [`Enc_utf16]: UTF-16 with unspecified endianess (restricted) + * - [`Enc_utf16_le]: UTF-16 little endian + * - [`Enc_utf16_be]: UTF-16 big endian + * - [`Enc_utf32]: UTF-32 with unspecified endianess (restricted) + * - [`Enc_utf32_le]: UTF-32 little endian + * - [`Enc_utf32_be]: UTF-32 big endian + * - [`Enc_usascii]: US-ASCII (7 bits) + * - [`Enc_iso8859]{i n}: ISO-8859-{i n} + * - [`Enc_koi8r]: KOI8-R + * - [`Enc_jis0201]: JIS-X-0201 (Roman and Katakana) + * - [`Enc_eucjp]: EUC-JP (code points from US-ASCII, JIS-X-0202, -0208, and + * -0212) + * - [`Enc_euckr]: EUC-KR (code points from US-ASCII, KS-X-1001) + * - [`Enc_windows]{i n}: WINDOWS-{i n} + * - [`Enc_cp]{i n}: IBM code page {i n}. Note that there are both ASCII- + * and EBCDIC-based code pages + * - [`Enc_adobe_*]: Adobe-specific encodings, e.g. used in Adobe fonts + * - [`Enc_mac*]: Macintosh-specific encodings + * - [`Enc_subset(e,def)]: The subset of [e] by applying the definition + * function [def] + * - [`Enc_empty]: The empty encoding (does not represent any character) + *) + +(**********************************************************************) +(* String functions *) +(**********************************************************************) + +(** {2:unicode_functions Unicode String Functions} *) + +val uarray_of_ustring : + encoding -> ?range_pos:int -> ?range_len:int -> string -> int array +(** Returns the characters of the string as array of Unicode code points. + * + * @param range_pos The byte position of the substring to extract + * (default: 0) + * @param range_len The byte length of the substring to extract + * (default: byte length of the input string minus [range_pos]) + *) + +val ustring_of_uarray : + ?subst:(int -> string) -> + encoding -> + ?pos:int -> + ?len:int -> + int array -> + string +(** Returns the array of Unicode code points as encoded string. + * + * @param pos Selects a subarray: [pos] is the first array position + * to encode (default: 0) + * @param len Selects a subarray: [len] is the length of the subarray + * to encode (default: array length minus [pos]) + * @param subst This function is called when a code point cannot be represented + * in the chosen character encoding. It must returns the (already encoded) + * string to substitute for this code point. By default + * (if ~subst is not passed), the exception [Cannot_represent] + * will be raised in this case. + *) + +val convert_poly : + in_ops:'s1 Netstring_tstring.tstring_ops -> + out_kind:'s2 Netstring_tstring.tstring_kind -> + ?subst:(int -> string) -> + in_enc:encoding -> + out_enc:encoding -> + ?range_pos:int -> + ?range_len:int -> + 's1 -> + 's2 + +val convert : + ?subst:(int -> string) -> + in_enc:encoding -> + out_enc:encoding -> + ?range_pos:int -> + ?range_len:int -> + string -> + string +(** Converts the string from [in_enc] to [out_enc], and returns it. +* The string must consist of a whole number of characters. If it +* ends with an incomplete multi-byte character, however, this is +* detected, and the exception [Malformed_code] will be raised. +* This exception is also raised for other encoding errors in the +* input string. +* +* @param subst This function is invoked for code points of [in_enc] that +* cannot be represented in [out_enc], and the result of the function +* invocation is substituted (directly, without any further conversion). +* Restriction: The string returned by [subst] must not be longer than 50 +* bytes. +* If [subst] is missing, [Cannot_represent] is raised in this case. +* +* @param range_pos Selects a substring for conversion. [range_pos] +* is the byte position of the first character of the substring. +* (Default: 0) +* +* @param range_len Selects a substring for conversion. [range_len] +* is the length of the substring in bytes (Default: Length +* of the input string minus [range_pos]) +*) + +val is_ascii_compatible : encoding -> bool +(** "ASCII compatible" means: The bytes 1 to 127 represent the ASCII + * codes 1 to 127, and no other representation of a character contains + * the bytes 1 to 127. + * + * For example, ISO-8859-1 is ASCII-compatible because the byte 1 to + * 127 mean the same as in ASCII, and all other characters use bytes + * greater than 127. UTF-8 is ASCII-compatible for the same reasons, + * it does not matter that there are multi-byte characters. + * EBCDIC is not ASCII-compatible because the bytes 1 to 127 do not mean + * the same as in ASCII. UTF-16 is not ASCII-compatible because the bytes + * 1 to 127 can occur in multi-byte representations of non-ASCII + * characters. + * + * The byte 0 has been excluded from this definition because the C + * language uses it with a special meaning that has nothing to do with + * characters, so it is questionable to interpret the byte 0 anyway. + *) + +val is_single_byte : encoding -> bool +(** Returns whether the encoding is a single-byte encoding *) + +val makechar : encoding -> int -> string +(** [makechar enc i:] + * Creates the string representing the Unicode code point [i] in encoding + * [enc]. Raises [Not_found] if the character is legal but cannot be + * represented in [enc]. + * + * Possible encodings: everything but [`Enc_utf16] and [`Enc_utf32] + * + * Evaluation hints: + * - PRE_EVAL(encoding) + * + * @deprecated This function is deprecated since ocamlnet-0.96. Use + * [ustring_of_uchar] instead. + *) diff --git a/ocamlnet_lite/netdb.ml b/ocamlnet_lite/netdb.ml new file mode 100644 index 0000000..29bed6b --- /dev/null +++ b/ocamlnet_lite/netdb.ml @@ -0,0 +1,34 @@ +let values = Hashtbl.create 13 +let loaders = Hashtbl.create 13 +let cksums = Hashtbl.create 13 +let enabled = ref true + +let read_db name = + let v = + try Hashtbl.find values name + with Not_found -> + if not !enabled then + failwith + ("Ocamlnet: The lookup table `" ^ name + ^ "' is not compiled into the program, and access to " + ^ "the external file database is disabled"); + let loader = + try Hashtbl.find loaders name + with Not_found -> failwith ("Ocamlnet: No such lookup table: " ^ name) + in + loader name + in + try + let cksum = Hashtbl.find cksums name in + if Digest.string v <> cksum then + failwith ("Netdb: checksum error for table: " ^ name); + v + with Not_found -> v + +let exists_db name = + Hashtbl.mem values name || (!enabled && Hashtbl.mem loaders name) + +let set_db name value = Hashtbl.replace values name value +let set_db_checksum name cksum = Hashtbl.replace cksums name cksum +let set_db_loader name loader = Hashtbl.replace loaders name loader +let enable_db_loaders b = enabled := b diff --git a/ocamlnet_lite/netdb.mli b/ocamlnet_lite/netdb.mli new file mode 100644 index 0000000..4559185 --- /dev/null +++ b/ocamlnet_lite/netdb.mli @@ -0,0 +1,38 @@ +(* This is an internal interface of ocamlnet! Do not use outside! *) + +(* This module manages persistent values (often lookup tables). These + * values can be stored in external files, or they can be initialized + * from string values. + *) + +val read_db : string -> string + (* Reads the value with the given name, and returns it. + * + * First it is checked whether there was a set_db call, and if so, + * this value is unmarshalled and returned. Otherwise, it is checked + * whether there is a loader, and if so, it is called. + * + * In both cases the checksum is checked. + *) + +val exists_db : string -> bool + (* Checks whether the named value is available, i.e. read_db would + * be able to find it + *) + +val set_db_checksum : string -> string -> unit + (* [set_db_checksum key cksum]: sets the MD5 digest of this key *) + +val set_db : string -> string -> unit + (* Sets the persistent value with the given name (1st arg) to the + * passed value (2nd arg). The value must be marshalled as string. + *) + +val set_db_loader : string -> (string -> string) -> unit + (* [set_db_loader key loader]: sets a loader for this key, which is called + when set_db has not been set for this key. The arg of the loader is the + key. + *) + +val enable_db_loaders : bool -> unit + (* Whether dynamic loading is enabled *) diff --git a/ocamlnet_lite/netencoding.ml b/ocamlnet_lite/netencoding.ml new file mode 100644 index 0000000..4094163 --- /dev/null +++ b/ocamlnet_lite/netencoding.ml @@ -0,0 +1,661 @@ +module Url = struct + (* adapted from https://gitlab.com/gerdstolpmann/lib-ocamlnet3/-/blob/4d1a8401bd40c17632128545e2aa4c880535e208/code/src/netstring/netencoding.ml#L993 *) + let hex_digits = + [| + '0'; + '1'; + '2'; + '3'; + '4'; + '5'; + '6'; + '7'; + '8'; + '9'; + 'A'; + 'B'; + 'C'; + 'D'; + 'E'; + 'F'; + |] + + let to_hex2 k = + (* Converts k to a 2-digit hex string *) + let s = Bytes.create 2 in + Bytes.set s 0 hex_digits.((k lsr 4) land 15); + Bytes.set s 1 hex_digits.(k land 15); + Bytes.unsafe_to_string s + + let of_hex1 c = + match c with + | '0' .. '9' -> Char.code c - Char.code '0' + | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10 + | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10 + | _ -> raise Not_found + + let url_encoding_re = Netstring_str.regexp "[^A-Za-z0-9_.!*-]" + let url_decoding_re = Netstring_str.regexp "\\+\\|%..\\|%.\\|%" + + let encode ?(plus = true) s = + Netstring_str.global_substitute url_encoding_re + (fun r _ -> + match Netstring_str.matched_string r s with + | " " when plus -> "+" + | x -> + let k = Char.code x.[0] in + "%" ^ to_hex2 k) + s + + let decode ?(plus = true) ?(pos = 0) ?len s = + let s_l = String.length s in + let s1 = + if pos = 0 && len = None then s + else + let len = match len with Some n -> n | None -> s_l in + String.sub s pos len + in + let l = String.length s1 in + Netstring_str.global_substitute url_decoding_re + (fun r _ -> + match Netstring_str.matched_string r s1 with + | "+" -> if plus then " " else "+" + | _ -> ( + let i = Netstring_str.match_beginning r in + (* Assertion: s1.[i] = '%' *) + if i + 2 >= l then failwith "Web.Url.decode"; + let c1 = s1.[i + 1] in + let c2 = s1.[i + 2] in + try + let k1 = of_hex1 c1 in + let k2 = of_hex1 c2 in + String.make 1 (Char.chr ((k1 lsl 4) lor k2)) + with Not_found -> failwith "Web.Url.decode")) + s1 + + let url_split_re = Netstring_str.regexp "[&=]" + + let dest_url_encoded_parameters parstr = + let rec parse_after_amp tl = + match tl with + | Netstring_str.Text name + :: Netstring_str.Delim "=" + :: Netstring_str.Text value + :: tl' -> + (decode name, decode value) :: parse_next tl' + | Netstring_str.Text name + :: Netstring_str.Delim "=" + :: Netstring_str.Delim "&" + :: tl' -> + (decode name, "") :: parse_after_amp tl' + | [ Netstring_str.Text name; Netstring_str.Delim "=" ] -> + [ (decode name, "") ] + | _ -> failwith "Web.Url.dest_url_encoded_parameters" + and parse_next tl = + match tl with + | [] -> [] + | Netstring_str.Delim "&" :: tl' -> parse_after_amp tl' + | _ -> failwith "Web.Url.dest_url_encoded_parameters" + in + let toklist = Netstring_str.full_split url_split_re parstr in + match toklist with [] -> [] | _ -> parse_after_amp toklist +end + +module Html = struct + let etable = + [ + ("lt", 60); + ("gt", 62); + ("amp", 38); + ("quot", 34); + (* Note: " is new in HTML-4.0, but it has been widely used + * much earlier. + *) + ("apos", 39); + (* Only used if contained in unsafe_chars *) + (* ISO-8859-1: *) + ("nbsp", 160); + ("iexcl", 161); + ("cent", 162); + ("pound", 163); + ("curren", 164); + ("yen", 165); + ("brvbar", 166); + ("sect", 167); + ("uml", 168); + ("copy", 169); + ("ordf", 170); + ("laquo", 171); + ("not", 172); + ("shy", 173); + ("reg", 174); + ("macr", 175); + ("deg", 176); + ("plusmn", 177); + ("sup2", 178); + ("sup3", 179); + ("acute", 180); + ("micro", 181); + ("para", 182); + ("middot", 183); + ("cedil", 184); + ("sup1", 185); + ("ordm", 186); + ("raquo", 187); + ("frac14", 188); + ("frac12", 189); + ("frac34", 190); + ("iquest", 191); + ("Agrave", 192); + ("Aacute", 193); + ("Acirc", 194); + ("Atilde", 195); + ("Auml", 196); + ("Aring", 197); + ("AElig", 198); + ("Ccedil", 199); + ("Egrave", 200); + ("Eacute", 201); + ("Ecirc", 202); + ("Euml", 203); + ("Igrave", 204); + ("Iacute", 205); + ("Icirc", 206); + ("Iuml", 207); + ("ETH", 208); + ("Ntilde", 209); + ("Ograve", 210); + ("Oacute", 211); + ("Ocirc", 212); + ("Otilde", 213); + ("Ouml", 214); + ("times", 215); + ("Oslash", 216); + ("Ugrave", 217); + ("Uacute", 218); + ("Ucirc", 219); + ("Uuml", 220); + ("Yacute", 221); + ("THORN", 222); + ("szlig", 223); + ("agrave", 224); + ("aacute", 225); + ("acirc", 226); + ("atilde", 227); + ("auml", 228); + ("aring", 229); + ("aelig", 230); + ("ccedil", 231); + ("egrave", 232); + ("eacute", 233); + ("ecirc", 234); + ("euml", 235); + ("igrave", 236); + ("iacute", 237); + ("icirc", 238); + ("iuml", 239); + ("eth", 240); + ("ntilde", 241); + ("ograve", 242); + ("oacute", 243); + ("ocirc", 244); + ("otilde", 245); + ("ouml", 246); + ("divide", 247); + ("oslash", 248); + ("ugrave", 249); + ("uacute", 250); + ("ucirc", 251); + ("uuml", 252); + ("yacute", 253); + ("thorn", 254); + ("yuml", 255); + (* Other: *) + ("fnof", 402); + ("Alpha", 913); + ("Beta", 914); + ("Gamma", 915); + ("Delta", 916); + ("Epsilon", 917); + ("Zeta", 918); + ("Eta", 919); + ("Theta", 920); + ("Iota", 921); + ("Kappa", 922); + ("Lambda", 923); + ("Mu", 924); + ("Nu", 925); + ("Xi", 926); + ("Omicron", 927); + ("Pi", 928); + ("Rho", 929); + ("Sigma", 931); + ("Tau", 932); + ("Upsilon", 933); + ("Phi", 934); + ("Chi", 935); + ("Psi", 936); + ("Omega", 937); + ("alpha", 945); + ("beta", 946); + ("gamma", 947); + ("delta", 948); + ("epsilon", 949); + ("zeta", 950); + ("eta", 951); + ("theta", 952); + ("iota", 953); + ("kappa", 954); + ("lambda", 955); + ("mu", 956); + ("nu", 957); + ("xi", 958); + ("omicron", 959); + ("pi", 960); + ("rho", 961); + ("sigmaf", 962); + ("sigma", 963); + ("tau", 964); + ("upsilon", 965); + ("phi", 966); + ("chi", 967); + ("psi", 968); + ("omega", 969); + ("thetasym", 977); + ("upsih", 978); + ("piv", 982); + ("bull", 8226); + ("hellip", 8230); + ("prime", 8242); + ("Prime", 8243); + ("oline", 8254); + ("frasl", 8260); + ("weierp", 8472); + ("image", 8465); + ("real", 8476); + ("trade", 8482); + ("alefsym", 8501); + ("larr", 8592); + ("uarr", 8593); + ("rarr", 8594); + ("darr", 8595); + ("harr", 8596); + ("crarr", 8629); + ("lArr", 8656); + ("uArr", 8657); + ("rArr", 8658); + ("dArr", 8659); + ("hArr", 8660); + ("forall", 8704); + ("part", 8706); + ("exist", 8707); + ("empty", 8709); + ("nabla", 8711); + ("isin", 8712); + ("notin", 8713); + ("ni", 8715); + ("prod", 8719); + ("sum", 8721); + ("minus", 8722); + ("lowast", 8727); + ("radic", 8730); + ("prop", 8733); + ("infin", 8734); + ("ang", 8736); + ("and", 8743); + ("or", 8744); + ("cap", 8745); + ("cup", 8746); + ("int", 8747); + ("there4", 8756); + ("sim", 8764); + ("cong", 8773); + ("asymp", 8776); + ("ne", 8800); + ("equiv", 8801); + ("le", 8804); + ("ge", 8805); + ("sub", 8834); + ("sup", 8835); + ("nsub", 8836); + ("sube", 8838); + ("supe", 8839); + ("oplus", 8853); + ("otimes", 8855); + ("perp", 8869); + ("sdot", 8901); + ("lceil", 8968); + ("rceil", 8969); + ("lfloor", 8970); + ("rfloor", 8971); + ("lang", 9001); + ("rang", 9002); + ("loz", 9674); + ("spades", 9824); + ("clubs", 9827); + ("hearts", 9829); + ("diams", 9830); + ("OElig", 338); + ("oelig", 339); + ("Scaron", 352); + ("scaron", 353); + ("Yuml", 376); + ("circ", 710); + ("tilde", 732); + ("ensp", 8194); + ("emsp", 8195); + ("thinsp", 8201); + ("zwnj", 8204); + ("zwj", 8205); + ("lrm", 8206); + ("rlm", 8207); + ("ndash", 8211); + ("mdash", 8212); + ("lsquo", 8216); + ("rsquo", 8217); + ("sbquo", 8218); + ("ldquo", 8220); + ("rdquo", 8221); + ("bdquo", 8222); + ("dagger", 8224); + ("Dagger", 8225); + ("permil", 8240); + ("lsaquo", 8249); + ("rsaquo", 8250); + ("euro", 8364); + ] + + let quick_etable_html = + let ht = Hashtbl.create 50 in + List.iter (fun (name, value) -> Hashtbl.add ht name value) etable; + ht + + let quick_etable_xml = + let ht = Hashtbl.create 5 in + List.iter + (fun name -> + let value = List.assoc name etable in + Hashtbl.add ht name value) + [ "lt"; "gt"; "amp"; "quot"; "apos" ]; + ht + + let rev_etable = + (* Only code points 0 to 255: *) + let a = Array.make 256 "" in + List.iter + (fun (name, value) -> if value <= 255 then a.(value) <- "&" ^ name ^ ";") + etable; + a + + let rev_etable_rest = + (* Only code points >= 256: *) + let ht = Hashtbl.create 150 in + List.iter + (fun (name, value) -> + if value >= 256 then Hashtbl.add ht value ("&" ^ name ^ ";")) + etable; + ht + + let unsafe_chars_html4 = + "<>\"&\000\001\002\003\004\005\006\007\008\011\012\014\015\016\017\018\019\020\021\022\023\024\025\026\027\028\029\030\031\127" + + let regexp_ht = Hashtbl.create 7 + + let regexp_set s = + try Hashtbl.find regexp_ht s + with Not_found -> + let re = Netstring_str.regexp (Netstring_str.quote_set s) in + if Hashtbl.length regexp_ht < 100 then + (* avoid leak *) + Hashtbl.replace regexp_ht s re; + re + + (* The functions [encode_quickly] and [encode_ascii] are special cases of + * [encode] that can be implemented by regular expressions. + *) + + let encode_quickly ~prefer_name ~unsafe_chars () = + (* Preconditions: in_enc = out_enc, and the encoding must be a single-byte, + * ASCII-compatible encoding. + *) + if unsafe_chars = "" then fun s -> s + else + let unsafe_re = regexp_set unsafe_chars in + Netstring_str.global_substitute unsafe_re (fun r s -> + let t = Netstring_str.matched_string r s in + let p = Char.code t.[0] in + (* p is an ASCII code point *) + let name = rev_etable.(p) in + if prefer_name && name <> "" then name + else "&#" ^ string_of_int p ^ ";") + + let encode_quickly_poly ~prefer_name ~unsafe_chars ~ops ~out_kind () = + Netstring_tstring.polymorph_string_transformation + (encode_quickly ~prefer_name ~unsafe_chars ()) + ops out_kind + + let msb_set = + let s = Bytes.create 128 in + for k = 0 to 127 do + Bytes.set s k (Char.chr (128 + k)) + done; + Bytes.unsafe_to_string s + + let encode_ascii ~in_enc ~prefer_name ~unsafe_chars () = + (* Preconditions: out_enc = `Enc_usascii, and in_enc must be a single-byte, + * ASCII-compatible encoding. + *) + let unsafe_chars1 = unsafe_chars ^ msb_set in + let unsafe_re = regexp_set unsafe_chars1 in + (* unicode_of.[q] = p: the code point q+128 of in_enc is the same as the + * Unicode code point p + *) + let unicode_of = Array.make 128 (-1) in + for i = 0 to 127 do + try + let s = String.make 1 (Char.chr (i + 128)) in + let u = Netconversion.uarray_of_ustring in_enc s in + match u with [| u0 |] -> unicode_of.(i) <- u0 | _ -> assert false + with Netconversion.Malformed_code -> unicode_of.(i) <- -1 + done; + Netstring_str.global_substitute unsafe_re (fun r s -> + let t = Netstring_str.matched_string r s in + (* p is the code point in the encoding ~in_enc; p' is the Unicode + * code point: + *) + let p = Char.code t.[0] in + let p' = if p < 128 then p else unicode_of.(p - 128) in + if p' < 0 then raise Netconversion.Malformed_code; + let name = + if prefer_name then + if p' <= 255 then rev_etable.(p') + else try Hashtbl.find rev_etable_rest p' with Not_found -> "" + else "" + in + if name = "" then "&#" ^ string_of_int p' ^ ";" else name) + + let encode_ascii_poly ~in_enc ~prefer_name ~unsafe_chars ~ops ~out_kind () = + Netstring_tstring.polymorph_string_transformation + (encode_ascii ~in_enc ~prefer_name ~unsafe_chars ()) + ops out_kind + + let encode_poly ~in_enc ~in_ops ~out_kind ?(out_enc = `Enc_usascii) + ?(prefer_name = true) ?(unsafe_chars = unsafe_chars_html4) () = + (* This function implements the general case *) + (* Check arguments: *) + if not (Netconversion.is_ascii_compatible out_enc) then + invalid_arg "Netencoding.Html.encode: out_enc not ASCII-compatible"; + for i = 0 to String.length unsafe_chars - 1 do + if Char.code unsafe_chars.[i] >= 128 then + invalid_arg + "Netencoding.Html.encode: non-ASCII character in unsafe_chars" + done; + (* Are there better implementations than the general one? *) + let in_single = Netconversion.is_single_byte in_enc in + let in_subset = + match in_enc with `Enc_subset (_, _) -> true | _ -> false + in + if (not in_subset) && in_enc = out_enc && in_single then + encode_quickly_poly ~prefer_name ~unsafe_chars ~ops:in_ops ~out_kind () + else if (not in_subset) && out_enc = `Enc_usascii && in_single then + encode_ascii_poly ~in_enc ~prefer_name ~unsafe_chars ~ops:in_ops ~out_kind + () + else + (* ... only the general implementation is applicable. *) + (* Create the domain function: *) + let dom_array = Array.make 128 true in + let dom p = p >= 128 || dom_array.(p) in + (* Set dom_array from unsafe_chars: *) + for i = 0 to String.length unsafe_chars - 1 do + let c = Char.code unsafe_chars.[i] in + dom_array.(c) <- false + done; + (* Create the substitution function: *) + let subst p = + let name = + if prefer_name then + if p <= 255 then rev_etable.(p) + else try Hashtbl.find rev_etable_rest p with Not_found -> "" + else "" + in + if name = "" then "&#" ^ string_of_int p ^ ";" else name + in + (* Recode: *) + fun s -> + Netconversion.convert_poly ~in_ops ~out_kind ~subst ~in_enc + ~out_enc:(`Enc_subset (out_enc, dom)) + s + + let encode ~in_enc ?out_enc ?prefer_name ?unsafe_chars () = + let in_ops = Netstring_tstring.string_ops in + let out_kind = Netstring_tstring.String_kind in + encode_poly ~in_enc ~in_ops ~out_kind ?out_enc ?prefer_name ?unsafe_chars () + + type entity_set = [ `Html | `Xml | `Empty ] + + let eref_re = + Netstring_str.regexp + "&\\(#\\([0-9]+\\);\\|#[xX]\\([0-9a-fA-F]+\\);\\|\\([a-zA-Z]+\\);\\)" + + let total_enc = + (* every byte must have a corresponding Unicode code point, i.e. the + * encoding must be "byte-total" + *) + function + | `Enc_iso88591 | `Enc_iso88592 | `Enc_iso88593 | `Enc_iso88594 + | `Enc_iso88595 | `Enc_iso88599 | `Enc_iso885910 | `Enc_iso885913 + | `Enc_iso885914 | `Enc_iso885915 | `Enc_iso885916 -> + true + | _ -> false + + let hex_digit_of_char c = + match c with + | '0' .. '9' -> Char.code c - 48 + | 'A' .. 'F' -> Char.code c - 55 + | 'a' .. 'f' -> Char.code c - 87 + | _ -> assert false + + let hex_of_string s = + let n = ref 0 in + for i = 0 to String.length s - 1 do + let d = hex_digit_of_char s.[i] in + n := (!n lsl 4) lor d + done; + !n + + let search_all re s pos = + let rec search p acc = + match + try Some (Netstring_str.search_forward re s p) with Not_found -> None + with + | Some (k, r) -> search (k + 1) ((k, r) :: acc) + | None -> List.rev acc + in + search pos [] + + let decode_half_poly ~in_enc ~out_kind ~out_enc + ?(lookup = + fun name -> + failwith ("Netencoding.Html.decode: Unknown entity `" ^ name ^ "'")) + ?(subst = + fun p -> + failwith + ("Netencoding.Html.decode: Character cannot be represented: " + ^ string_of_int p)) ?(entity_base = (`Html : entity_set)) () = + (* Argument checks: *) + if not (Netconversion.is_ascii_compatible in_enc) then + invalid_arg "Netencoding.Html.decode: in_enc not ASCII-compatible"; + (* makechar: *) + let raw_makechar = Netconversion.makechar out_enc in + let makechar p = try raw_makechar p with Not_found -> subst p in + (* Entity lookup: *) + let lookup_entity = + match entity_base with + | `Html | `Xml -> ( + let ht = + if entity_base = `Html then quick_etable_html else quick_etable_xml + in + fun name -> + try makechar (Hashtbl.find ht name) with Not_found -> lookup name) + | `Empty -> lookup + in + (* Recode strings: *) + let recode_str = + if total_enc in_enc && in_enc = out_enc then fun s pos len -> + if pos = 0 && len = String.length s then s else String.sub s pos len + else fun s range_pos range_len -> + Netconversion.convert ~in_enc ~out_enc ~subst ~range_pos ~range_len s + in + fun s -> + (* Find all occurrences of &name; or &#num; or &#xnum; *) + let occurrences = search_all eref_re s 0 in + (* Collect the resulting string in a buffer *) + let buf = Netbuffer.create 250 in + let n = ref 0 in + List.iter + (fun (n0, r) -> + let n1 = Netstring_str.match_end r in + if n0 > !n then Netbuffer.add_string buf (recode_str s !n (n0 - !n)); + let replacement = + let num = + try Netstring_str.matched_group r 2 s with Not_found -> "" + in + (* Note: Older versions of Pcre return "" when the substring + * did not match, newer versions raise Not_found + *) + if num <> "" then + let n = int_of_string num in + makechar n + else + let xnum = + try Netstring_str.matched_group r 3 s with Not_found -> "" + in + (* Note: Older versions of Pcre return "" when the substring + * did not match, newer versions raise Not_found + *) + if xnum <> "" then + let n = hex_of_string xnum in + makechar n + else + let name = + try Netstring_str.matched_group r 4 s with Not_found -> "" + in + (* Note: Older versions of Pcre return "" when the substring + * did not match, newer versions raise Not_found + *) + assert (name <> ""); + lookup_entity name + in + Netbuffer.add_string buf replacement; + n := n1) + occurrences; + let n0 = String.length s in + if n0 > !n then Netbuffer.add_string buf (recode_str s !n (n0 - !n)); + (* Return *) + Netbuffer.to_tstring_poly buf out_kind + + let decode ~in_enc ~out_enc ?lookup ?subst ?entity_base () = + let out_kind = Netstring_tstring.String_kind in + decode_half_poly ~in_enc ~out_kind ~out_enc ?lookup ?subst ?entity_base () + +end diff --git a/ocamlnet_lite/netencoding.mli b/ocamlnet_lite/netencoding.mli new file mode 100644 index 0000000..0969e7d --- /dev/null +++ b/ocamlnet_lite/netencoding.mli @@ -0,0 +1,137 @@ +(* *********************************************************************) +(* HTMLization *) +(* *********************************************************************) + +(* THREAD-SAFETY: + * The Html functions are thread-safe. + *) + +module Url : sig +(** Encoding/Decoding within URLs: + * + * The following two functions perform the '%'-substitution for + * characters that may otherwise be interpreted as metacharacters. + * + * According to: RFC 1738, RFC 1630 + * + * Option [plus]: This option has been added because there are some + * implementations that do not map ' ' to '+', for example Javascript's + * [escape] function. The default is [true] because this is the RFC- + * compliant definition. + *) + +(** There are no tstring and polymorphic versions of the encode and + decode functions, as URLs are comparatively short, and it is + considered as acceptable for the user to convert types as needed, + even if strings need to be copied for that. +*) + + val decode : ?plus:bool -> ?pos:int -> ?len:int -> string -> string + (** Option [plus]: Whether '+' is converted to space. The default + * is true. If false, '+' is returned as it is. + * + * The optional arguments [pos] and [len] may restrict the string + * to process to this substring. + *) + + val encode : ?plus:bool -> string -> string + (** Option [plus]: Whether spaces are converted to '+'. The default + * is true. If false, spaces are converted to "%20", and + * only %xx sequences are produced. + *) + + val dest_url_encoded_parameters : string -> (string * string) list + (** The argument is the URL-encoded parameter string. The result is + * the corresponding list of (name,value) pairs. + * Note: Whitespace within the parameter string is ignored. + * If there is a format error, the function fails. + *) +end + +module Html : sig + (** Encodes characters that need protection by converting them to + * entity references. E.g. ["<"] is converted to ["<"]. + * As the entities may be named, there is a dependency on the character + * set. + *) + + val encode : + in_enc:Netconversion.encoding -> + ?out_enc:Netconversion.encoding -> + (* default: `Enc_usascii *) + ?prefer_name:bool -> + (* default: true *) + ?unsafe_chars:string -> + (* default: unsafe_chars_html4 *) + unit -> + string -> + string + (** The input string that is encoded as [in_enc] is recoded to + * [out_enc], and the following characters are encoded as HTML + * entity ([&name;] or [&#num;]): + * - The ASCII characters contained in [unsafe_chars] + * - The characters that cannot be represented in [out_enc]. By + * default ([out_enc=`Enc_usascii]), only ASCII characters can be + * represented, and thus all code points >= 128 are encoded as + * HTML entities. If you pass [out_enc=`Enc_utf8], all characters + * can be represented. + * + * For example, the string ["(ad)"] is encoded as + * ["(a<b) & (c>d)"]. + * + * It is required that [out_enc] is an ASCII-compatible encoding. + * + * The option [prefer_name] selects whether named entities (e.g. [<]) + * or numeric entities (e.g. [<]) are prefered. + * + * The efficiency of the function can be improved when the same encoding + * is applied to several strings. Create a specialized encoding function + * by passing all arguments up to the unit argument, and apply this + * function several times. For example: + * {[ + * let my_enc = encode ~in_enc:`Enc_utf8 () in + * let s1' = my_enc s1 in + * let s2' = my_enc s2 in ... + * ]} + *) + + type entity_set = [ `Html | `Xml | `Empty ] + + val decode : + in_enc:Netconversion.encoding -> + out_enc:Netconversion.encoding -> + ?lookup:(string -> string) -> + (* default: see below *) + ?subst:(int -> string) -> + (* default: see below *) + ?entity_base:entity_set -> + (* default: `Html *) + unit -> + string -> + string + (** The input string is recoded from [in_enc] to [out_enc], and HTML + * entities ([&name;] or [&#num;]) are resolved. The input encoding + * [in_enc] must be ASCII-compatible. + * + * By default, the function knows all entities defined for HTML 4 (this + * can be changed using [entity_base], see below). If other + * entities occur, the function [lookup] is called and the name of + * the entity is passed as input string to the function. It is + * expected that [lookup] returns the value of the entity, and that this + * value is already encoded as [out_enc]. + * By default, [lookup] raises a [Failure] exception. + * + * If a character cannot be represented in the output encoding, + * the function [subst] is called. [subst] must return a substitute + * string for the character. + * By default, [subst] raises a [Failure] exception. + * + * The option [entity_base] determines which set of entities are + * considered as the known entities that can be decoded without + * help by the [lookup] function: [`Html] selects all entities defined + * for HTML 4, [`Xml] selects only [<], [>], [&], ["], + * and ['], + * and [`Empty] selects the empty set (i.e. [lookup] is always called). + *) + +end diff --git a/ocamlnet_lite/netmappings.ml b/ocamlnet_lite/netmappings.ml new file mode 100644 index 0000000..e0f7995 --- /dev/null +++ b/ocamlnet_lite/netmappings.ml @@ -0,0 +1,39 @@ +(* $Id$ + * ---------------------------------------------------------------------- + * + *) + +type from_uni_list = + | U_nil + | U_single of (int * int) + | U_double of (int * int * int * int) + | U_array of int array + +let to_unicode = Hashtbl.create 50 +let from_unicode = Hashtbl.create 50 + +let get_to_unicode enc_name : int array = + try + let table = + try Hashtbl.find to_unicode enc_name + with Not_found -> + let t_str = Netdb.read_db ("cmapf." ^ enc_name) in + let t = Marshal.from_string t_str 0 in + Hashtbl.add to_unicode enc_name t; + t + in + table + with error -> raise error + +let get_from_unicode enc_name : from_uni_list array = + try + let table = + try Hashtbl.find from_unicode enc_name + with Not_found -> + let t_str = Netdb.read_db ("cmapr." ^ enc_name) in + let t = Marshal.from_string t_str 0 in + Hashtbl.add from_unicode enc_name t; + t + in + table + with error -> raise error diff --git a/ocamlnet_lite/netmappings.mli b/ocamlnet_lite/netmappings.mli new file mode 100644 index 0000000..d89c3d1 --- /dev/null +++ b/ocamlnet_lite/netmappings.mli @@ -0,0 +1,46 @@ +(* $Id$ + * ---------------------------------------------------------------------- + *) + +(** Internal access to the character conversion database + * + * This is an internal module. + *) + +type from_uni_list = + U_nil + | U_single of (int*int) + | U_double of (int*int * int*int) + | U_array of int array +;; + (* A representation of (int*int) list that is optimized for the case that + * lists with 0 and 1 and 2 elements are the most frequent cases. + *) + + +val get_to_unicode : string -> int array + +val get_from_unicode : string -> from_uni_list array + (* These functions get the conversion tables from local encodings to + * Unicode and vice versa. + * It is normally not necessary to access these tables; the + * Netconversion module does it already for you. + * + * The argument is the internal name of the encoding. (E.g. if + * encoding = `Enc_iso88591, the internal name is "iso88591", i.e. + * the "`Enc_" prefix is removed. However, for "composite encodings" + * like `Enc_eucjp things are more complicated.) + * + * Specification of the conversion tables: + * + * to_unicode: maps a local code to Unicode, i.e. + * let m = Hashtbl.find `Enc_isoXXX to_unicode in + * let unicode = m.(isocode) + * - This may be (-1) to indicate that the code point is not defined. + * + * from_unicode: maps Unicode to a local code, i.e. + * let m = Hashtbl.find `Enc_isoXXX from_unicode in + * let l = m.(unicode land mask) + * Now search in l the pair (unicode, isocode), and return isocode. + * Where mask = Array.length from_unicode - 1 + *) diff --git a/ocamlnet_lite/netstring_str.ml b/ocamlnet_lite/netstring_str.ml index 9449fe5..79b275f 100644 --- a/ocamlnet_lite/netstring_str.ml +++ b/ocamlnet_lite/netstring_str.ml @@ -1,5 +1,37 @@ open ExtLib +let explode s = + let l = String.length s in + let rec loop k = if k < l then s.[k] :: loop (k + 1) else [] in + loop 0 + +let implode l = + let n = List.length l in + let s = Bytes.create n in + let k = ref 0 in + List.iter + (fun c -> + Bytes.set s !k c; + incr k) + l; + Bytes.to_string s + +let quote_set s = + let l = explode s in + let have_circum = List.mem '^' l in + let have_minus = List.mem '-' l in + let have_rbracket = List.mem ']' l in + let l1 = List.filter (fun c -> c <> '^' && c <> '-' && c <> ']') l in + let l2 = if have_rbracket then ']' :: l1 else l1 in + let l3 = if have_circum then l2 @ [ '^' ] else l2 in + let l4 = if have_minus then l3 @ [ '-' ] else l3 in + let s4 = implode l4 in + match s4 with + | "" -> failwith "Netstring_str.quote_set: empty" + | "^" -> "^" + | "^-" -> "[-^]" + | _ -> "[" ^ s4 ^ "]" + type setatom = Schar of char | Srange of (char * char) and set = setatom list @@ -189,8 +221,7 @@ let scan_str_regexp re_string = (* the character after [ or [^ ? *) while !continue && !k < l do match () with - | () when !c = '[' && !k + 1 < l && re_string.[!k + 1] = ':' - -> + | () when !c = '[' && !k + 1 < l && re_string.[!k + 1] = ':' -> failwith "regexp: Character classes such as [[:digit:]] not \ implemented" @@ -286,6 +317,10 @@ let regexp s = let s' = print_pcre_regexp ret in Pcre.regexp ~flags:[ `MULTILINE ] s' +let search_forward pat s pos = + let result = Pcre.exec ~rex:pat ~pos s in + (fst (Pcre.get_substring_ofs result 0), result) + let matched_string result _ = (* Unfortunately, Pcre.get_substring will not raise Not_found if there is * no matched string. Instead, it returns "", but this value cannot be @@ -299,6 +334,13 @@ let matched_string result _ = Pcre.get_substring result 0 let match_beginning result = fst (Pcre.get_substring_ofs result 0) +let match_end result = snd (Pcre.get_substring_ofs result 0) + +let matched_group result n _ = + (* See also the comment for [matched_string] *) + if n < 0 || n >= Pcre.num_of_subs result then raise Not_found; + ignore (Pcre.get_substring_ofs result n); + Pcre.get_substring result n let global_substitute pat subst s = Pcre.substitute_substrings ~rex:pat ~subst:(fun r -> subst r s) s @@ -311,6 +353,4 @@ let tr_split_result r = (function Pcre.Group (_, _) | Pcre.NoGroup -> false | _ -> true) r) -let full_split sep s = - tr_split_result (Pcre.full_split ~rex:sep ~max:(-1) s) - +let full_split sep s = tr_split_result (Pcre.full_split ~rex:sep ~max:(-1) s) diff --git a/ocamlnet_lite/netstring_str.mli b/ocamlnet_lite/netstring_str.mli index e1eae0d..19762ce 100644 --- a/ocamlnet_lite/netstring_str.mli +++ b/ocamlnet_lite/netstring_str.mli @@ -11,6 +11,19 @@ type result val regexp : string -> regexp (** Parses a regexp *) +val search_forward : regexp -> string -> int -> int * result +(** Searches a match of the string with the regexp, starting at + * the position and in forward direction. + * Raises [Not_found] if no match could be found. + * Returns [(p,r)] when a match at position [p] is found, + * described by [r]. + *) + +val quote_set : string -> string +(** Returns a regexp (as string) that matches any of the characters in + the argument. The argument must be non-empty + *) + val matched_string : result -> string -> string (** Extracts the matched part from the string. The string argument * must be the same string passed to [string_match] or the search @@ -21,6 +34,17 @@ val matched_string : result -> string -> string val match_beginning : result -> int (** Returns the position where the matched part begins *) +val match_end : result -> int +(** Returns the position where the matched part ends *) + +val matched_group : result -> int -> string -> string +(** Extracts the substring the nth group matches from the whole + * string. The string argument + * must be the same string passed to [string_match] or the search + * functions, and the result argument must be the corresponding + * result. + *) + val full_split : regexp -> string -> split_result list (** Like [split_delim], but returns the delimiters in the result *) diff --git a/ocamlnet_lite/netstring_tstring.ml b/ocamlnet_lite/netstring_tstring.ml new file mode 100644 index 0000000..be4c8d9 --- /dev/null +++ b/ocamlnet_lite/netstring_tstring.ml @@ -0,0 +1,171 @@ +open Netsys_types + +type _ tstring_kind = + | String_kind : string tstring_kind + | Bytes_kind : Bytes.t tstring_kind + +type 't tstring_ops = { + kind : 't tstring_kind option; + length : 't -> int; + get : 't -> int -> char; + unsafe_get : 't -> int -> char; + unsafe_get3 : 't -> int -> int; (* get 3 chars packed into one int *) + copy : 't -> 't; + string : 't -> string; + bytes : 't -> Bytes.t; + sub : 't -> int -> int -> 't; + substring : 't -> int -> int -> string; + subbytes : 't -> int -> int -> Bytes.t; + subpoly : 'u. 'u tstring_kind -> 't -> int -> int -> 'u; + blit_to_bytes : 't -> int -> Bytes.t -> int -> int -> unit; + index_from : 't -> int -> char -> int; + index_from3 : 't -> int -> int -> char -> char -> char -> int; + rindex_from : 't -> int -> char -> int; + rindex_from3 : 't -> int -> int -> char -> char -> char -> int; +} + +type tstring_ops_box = + | Tstring_ops_box : 't tstring_kind * 't tstring_ops -> tstring_ops_box + +type tstring_box = + | Tstring_box : 't tstring_kind * 't tstring_ops * 't -> tstring_box + +type tstring_polybox = + | Tstring_polybox : 't tstring_ops * 't -> tstring_polybox +(* Warning: you cannot match on the type 't here *) + +let str_subpoly : type u. u tstring_kind -> string -> int -> int -> u = function + | String_kind -> String.sub + | Bytes_kind -> + fun s pos len -> + let b = Bytes.create len in + Bytes.blit_string s pos b 0 len; + b + +let str_index_from3 s p n c1 c2 c3 = + (* FIXME: implement in C *) + let sn = String.length s in + if n < 0 || p < 0 || p > sn - n then invalid_arg "index_from3"; + let lim = p + n in + let p = ref p in + while + !p < lim + && + let c = String.unsafe_get s !p in + c <> c1 && c <> c2 && c <> c3 + do + incr p + done; + if !p >= lim then raise Not_found; + !p + +let str_rindex_from3 s p n c1 c2 c3 = + (* FIXME: implement in C *) + let sn = String.length s in + if n < 0 || p < -1 || p >= sn || n - 1 > p then invalid_arg "rindex_from"; + let lim = p - n + 1 in + let p = ref p in + while + !p >= lim + && + let c = String.unsafe_get s !p in + c <> c1 && c <> c2 && c <> c3 + do + decr p + done; + if !p < lim then raise Not_found; + !p + +let string_ops = + { + kind = Some String_kind; + length = String.length; + get = String.get; + unsafe_get = String.unsafe_get; + unsafe_get3 = + (fun s k -> + let c0 = Char.code (String.unsafe_get s k) in + let c1 = Char.code (String.unsafe_get s (k + 1)) in + let c2 = Char.code (String.unsafe_get s (k + 2)) in + (c0 lsl 16) lor (c1 lsl 8) lor c2); + copy = (fun s -> s); + (* ... for the time being ... *) + string = (fun s -> s); + bytes = Bytes.of_string; + sub = String.sub; + substring = String.sub; + subbytes = + (fun s p l -> + let b = Bytes.create l in + Bytes.blit_string s p b 0 l; + b); + subpoly = str_subpoly; + blit_to_bytes = Bytes.blit_string; + index_from = String.index_from; + index_from3 = str_index_from3; + rindex_from = String.rindex_from; + rindex_from3 = str_rindex_from3; + } + +let bytes_index_from3 s p n c1 c2 c3 = + str_index_from3 (Bytes.unsafe_to_string s) p n c1 c2 c3 + +let bytes_rindex_from3 s p n c1 c2 c3 = + str_rindex_from3 (Bytes.unsafe_to_string s) p n c1 c2 c3 + +let bytes_subpoly : type u. u tstring_kind -> Bytes.t -> int -> int -> u = + function + | String_kind -> Bytes.sub_string + | Bytes_kind -> Bytes.sub + +let bytes_ops = + { + kind = Some Bytes_kind; + length = Bytes.length; + get = Bytes.get; + unsafe_get = Bytes.unsafe_get; + unsafe_get3 = + (fun s k -> + let c0 = Char.code (Bytes.unsafe_get s k) in + let c1 = Char.code (Bytes.unsafe_get s (k + 1)) in + let c2 = Char.code (Bytes.unsafe_get s (k + 2)) in + (c0 lsl 16) lor (c1 lsl 8) lor c2); + copy = Bytes.copy; + string = Bytes.to_string; + bytes = (fun s -> s); + sub = Bytes.sub; + substring = Bytes.sub_string; + subbytes = Bytes.sub; + subpoly = bytes_subpoly; + blit_to_bytes = Bytes.blit; + index_from = Bytes.index_from; + index_from3 = bytes_index_from3; + rindex_from = Bytes.rindex_from; + rindex_from3 = bytes_rindex_from3; + } + +let ops_of_tstring = function + | `String _ -> Tstring_ops_box (String_kind, string_ops) + | `Bytes _ -> Tstring_ops_box (Bytes_kind, bytes_ops) + +type 'a with_fun = { with_fun : 's. 's tstring_ops -> 's -> 'a } + +let with_tstring : 'a with_fun -> tstring -> 'a = + fun f -> function + | `String s -> f.with_fun string_ops s + | `Bytes s -> f.with_fun bytes_ops s + +let length_tstring ts = + with_tstring { with_fun = (fun ops s -> ops.length s) } ts + +let polymorph_string_transformation : + type s t. (string -> string) -> s tstring_ops -> t tstring_kind -> s -> t = + fun f ops out_kind s -> + let s' = f (ops.string s) in + match out_kind with + | String_kind -> s' + | Bytes_kind -> Bytes.of_string s' + +let tstring_of_tbuffer = function + | `Bytes s -> `Bytes s + | `String s -> `Bytes s diff --git a/ocamlnet_lite/netstring_tstring.mli b/ocamlnet_lite/netstring_tstring.mli new file mode 100644 index 0000000..0e4075d --- /dev/null +++ b/ocamlnet_lite/netstring_tstring.mli @@ -0,0 +1,82 @@ +(** Support module for tagged strings *) + +open Netsys_types + +(** GADT for encoding the string type (string/bytes/bigarray) *) +type _ tstring_kind = + | String_kind : string tstring_kind + | Bytes_kind : Bytes.t tstring_kind + +type 't tstring_ops = { + kind : 't tstring_kind option; + length : 't -> int; + get : 't -> int -> char; + unsafe_get : 't -> int -> char; + unsafe_get3 : 't -> int -> int; + (** get 3 chars packed into one int + (first char shifted by 16 bits, second char shifted by 8 bits, + third char unshifted) *) + copy : 't -> 't; + string : 't -> string; (** if possible this function does not make a copy *) + bytes : 't -> Bytes.t; (** if possible this function does not make a copy *) + sub : 't -> int -> int -> 't; + substring : 't -> int -> int -> string; + subbytes : 't -> int -> int -> Bytes.t; + subpoly : 'u. 'u tstring_kind -> 't -> int -> int -> 'u; + blit_to_bytes : 't -> int -> Bytes.t -> int -> int -> unit; + index_from : 't -> int -> char -> int; + index_from3 : 't -> int -> int -> char -> char -> char -> int; + (** finds any of three chars. The second int is the search radius *) + rindex_from : 't -> int -> char -> int; + rindex_from3 : 't -> int -> int -> char -> char -> char -> int; + (** finds any of three chars. The second int is the search radius *) +} +(** Operations to call on strings *) + +(** GADT for hiding the type parameter *) +type tstring_ops_box = + | Tstring_ops_box : 't tstring_kind * 't tstring_ops -> tstring_ops_box + +(** GADT for hiding the type parameter *) +type tstring_box = + | Tstring_box : 't tstring_kind * 't tstring_ops * 't -> tstring_box + +(** GADT for hiding the type parameter. Warning: This GADT does not permit you + to recover the kind of string + *) +type tstring_polybox = + | Tstring_polybox : 't tstring_ops * 't -> tstring_polybox + +val string_ops : string tstring_ops +(** Implementation of the operations for [string] *) + +val bytes_ops : Bytes.t tstring_ops +(** Implementation of the operations for [bytes] *) + +val ops_of_tstring : tstring -> tstring_ops_box +(** Create a [Tstring_ops_box] *) + +type 'a with_fun = { with_fun : 's. 's tstring_ops -> 's -> 'a } +(** A polymorphic function for strings *) + +val with_tstring : 'a with_fun -> tstring -> 'a +(** [with_tstring f ts]: Calls [f.with_fun] with the right implementation of + the [tstring_ops] argument + *) + +val length_tstring : tstring -> int +(** Get the length of a tagged string *) + +val tstring_of_tbuffer : tbuffer -> tstring +(** Get the tagged string of a tagged buffer *) + +val polymorph_string_transformation : + (string -> string) -> 's tstring_ops -> 't tstring_kind -> 's -> 't +(** [polymorph_string_transformation f ops kind s]: Converts [s] to a + string, runs [f] on this string, and converts the result to the + type demanded by [kind] + *) + +(**/**) + +val bytes_subpoly : 'u tstring_kind -> Bytes.t -> int -> int -> 'u diff --git a/ocamlnet_lite/netsys_types.ml b/ocamlnet_lite/netsys_types.ml new file mode 100644 index 0000000..19c76b3 --- /dev/null +++ b/ocamlnet_lite/netsys_types.ml @@ -0,0 +1,32 @@ +(* WARNING! THIS IS A COPY OF NETSYS_TYPES.MLI! *) + +(** Types for all Netsys modules *) + +(** {2 Bytes and characters} *) + +(** Remember that up to OCaml-4.01 there was only the [string] type, + and strings were mutable (although frequently used as if there were + immutable). Since OCaml-4.02 there is the immutable [string] and + the mutable [bytes] type. + + The general strategy for switching to the string/bytes scheme is + to replace [string] everywhere with [bytes], and to provide + additional functions taking strings as input or output where it + makes sense. There are exceptions, though, e.g. when the string + acts as a key in a data structure. + + The type name "string" also occurs in function names (e.g. + "get_string") and in variant names (e.g. [String_case]). As we + want to be backward compatible, we keep the old names for functions + on [bytes], and mark them as deprecated. + *) + +type tbuffer = [ `Bytes of Bytes.t | `String of Bytes.t ] + (** A tagged buffer. Note that the [`String] case is deprecated, and only + provided for backward compatibility. + *) + +type tstring = [ `Bytes of Bytes.t | `String of string ] + (** A tagged string which is considered as immutable. See also the + support module {!Netstring_tstring}. + *) diff --git a/ocamlnet_lite/netsys_types.mli b/ocamlnet_lite/netsys_types.mli new file mode 100644 index 0000000..d3faf1a --- /dev/null +++ b/ocamlnet_lite/netsys_types.mli @@ -0,0 +1,32 @@ +(* $Id$ *) + +(** Types for all Netsys modules *) + +(** {2 Bytes and characters} *) + +(** Remember that up to OCaml-4.01 there was only the [string] type, + and strings were mutable (although frequently used as if there were + immutable). Since OCaml-4.02 there is the immutable [string] and + the mutable [bytes] type. + + The general strategy for switching to the string/bytes scheme is + to replace [string] everywhere with [bytes], and to provide + additional functions taking strings as input or output where it + makes sense. There are exceptions, though, e.g. when the string + acts as a key in a data structure. + + The type name "string" also occurs in function names (e.g. + "get_string") and in variant names (e.g. [String_case]). As we + want to be backward compatible, we keep the old names for functions + on [bytes], and mark them as deprecated. + *) + +type tbuffer = [ `Bytes of Bytes.t | `String of Bytes.t ] + (** A tagged buffer. Note that the [`String] case is deprecated, and only + provided for backward compatibility. + *) + +type tstring = [ `Bytes of Bytes.t | `String of string ] + (** A tagged string which is considered as immutable. See also the + support module {!Netstring_tstring}. + *) diff --git a/ocamlnet_lite/url.ml b/ocamlnet_lite/url.ml deleted file mode 100644 index aea3b81..0000000 --- a/ocamlnet_lite/url.ml +++ /dev/null @@ -1,100 +0,0 @@ -(* adapted from https://gitlab.com/gerdstolpmann/lib-ocamlnet3/-/blob/4d1a8401bd40c17632128545e2aa4c880535e208/code/src/netstring/netencoding.ml#L993 *) -let hex_digits = - [| - '0'; - '1'; - '2'; - '3'; - '4'; - '5'; - '6'; - '7'; - '8'; - '9'; - 'A'; - 'B'; - 'C'; - 'D'; - 'E'; - 'F'; - |] - -let to_hex2 k = - (* Converts k to a 2-digit hex string *) - let s = Bytes.create 2 in - Bytes.set s 0 hex_digits.((k lsr 4) land 15); - Bytes.set s 1 hex_digits.(k land 15); - Bytes.unsafe_to_string s - -let of_hex1 c = - match c with - | '0' .. '9' -> Char.code c - Char.code '0' - | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10 - | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10 - | _ -> raise Not_found - -let url_encoding_re = Netstring_str.regexp "[^A-Za-z0-9_.!*-]" -let url_decoding_re = Netstring_str.regexp "\\+\\|%..\\|%.\\|%" - -let encode ?(plus = true) s = - Netstring_str.global_substitute url_encoding_re - (fun r _ -> - match Netstring_str.matched_string r s with - | " " when plus -> "+" - | x -> - let k = Char.code x.[0] in - "%" ^ to_hex2 k) - s - -let decode ?(plus = true) ?(pos = 0) ?len s = - let s_l = String.length s in - let s1 = - if pos = 0 && len = None then s - else - let len = match len with Some n -> n | None -> s_l in - String.sub s pos len - in - let l = String.length s1 in - Netstring_str.global_substitute url_decoding_re - (fun r _ -> - match Netstring_str.matched_string r s1 with - | "+" -> if plus then " " else "+" - | _ -> ( - let i = Netstring_str.match_beginning r in - (* Assertion: s1.[i] = '%' *) - if i + 2 >= l then failwith "Web.Url.decode"; - let c1 = s1.[i + 1] in - let c2 = s1.[i + 2] in - try - let k1 = of_hex1 c1 in - let k2 = of_hex1 c2 in - String.make 1 (Char.chr ((k1 lsl 4) lor k2)) - with Not_found -> failwith "Web.Url.decode")) - s1 - -let url_split_re = Netstring_str.regexp "[&=]" - -let dest_url_encoded_parameters parstr = - let rec parse_after_amp tl = - match tl with - | Netstring_str.Text name - :: Netstring_str.Delim "=" - :: Netstring_str.Text value - :: tl' -> - (decode name, decode value) :: parse_next tl' - | Netstring_str.Text name - :: Netstring_str.Delim "=" - :: Netstring_str.Delim "&" - :: tl' -> - (decode name, "") :: parse_after_amp tl' - | [ Netstring_str.Text name; Netstring_str.Delim "=" ] -> - [ (decode name, "") ] - | _ -> failwith "Web.Url.dest_url_encoded_parameters" - and parse_next tl = - match tl with - | [] -> [] - | Netstring_str.Delim "&" :: tl' -> parse_after_amp tl' - | _ -> failwith "Web.Url.dest_url_encoded_parameters" - in - let toklist = Netstring_str.full_split url_split_re parstr in - match toklist with [] -> [] | _ -> parse_after_amp toklist diff --git a/ocamlnet_lite/url.mli b/ocamlnet_lite/url.mli deleted file mode 100644 index 980f914..0000000 --- a/ocamlnet_lite/url.mli +++ /dev/null @@ -1,39 +0,0 @@ -(** Encoding/Decoding within URLs: - * - * The following two functions perform the '%'-substitution for - * characters that may otherwise be interpreted as metacharacters. - * - * According to: RFC 1738, RFC 1630 - * - * Option [plus]: This option has been added because there are some - * implementations that do not map ' ' to '+', for example Javascript's - * [escape] function. The default is [true] because this is the RFC- - * compliant definition. - *) - -(** There are no tstring and polymorphic versions of the encode and - decode functions, as URLs are comparatively short, and it is - considered as acceptable for the user to convert types as needed, - even if strings need to be copied for that. -*) - -val decode : ?plus:bool -> ?pos:int -> ?len:int -> string -> string -(** Option [plus]: Whether '+' is converted to space. The default - * is true. If false, '+' is returned as it is. - * - * The optional arguments [pos] and [len] may restrict the string - * to process to this substring. - *) - -val encode : ?plus:bool -> string -> string -(** Option [plus]: Whether spaces are converted to '+'. The default - * is true. If false, spaces are converted to "%20", and - * only %xx sequences are produced. - *) - -val dest_url_encoded_parameters : string -> (string * string) list -(** The argument is the URL-encoded parameter string. The result is - * the corresponding list of (name,value) pairs. - * Note: Whitespace within the parameter string is ignored. - * If there is a format error, the function fails. - *) diff --git a/web.ml b/web.ml index 21422e9..946c39d 100644 --- a/web.ml +++ b/web.ml @@ -9,16 +9,16 @@ open Ocamlnet_lite let log = Log.self (** percent-encode (convert space into %20) *) -let rawurlencode = Url.encode ~plus:false +let rawurlencode = Netencoding.Url.encode ~plus:false (** percent-encode, but convert space into plus, not %20 *) -let urlencode = Url.encode ~plus:true +let urlencode = Netencoding.Url.encode ~plus:true (** percent-decode (leave plus as is) *) -let rawurldecode s = try Url.decode ~plus:false s with _ -> s +let rawurldecode s = try Netencoding.Url.decode ~plus:false s with _ -> s (** percent-decode and convert plus into space *) -let urldecode s = try Url.decode ~plus:true s with _ -> s +let urldecode s = try Netencoding.Url.decode ~plus:true s with _ -> s let htmlencode = Netencoding.Html.encode ~in_enc:`Enc_utf8 ~out_enc:`Enc_utf8 () let htmldecode_exn = Netencoding.Html.decode ~in_enc:`Enc_utf8 ~out_enc:`Enc_utf8 () From 38f262fcc8ba76a2dc84ecd74e1b83133fb933fd Mon Sep 17 00:00:00 2001 From: Javier Chavarri Date: Tue, 12 Dec 2023 12:07:24 +0000 Subject: [PATCH 4/8] fix tests --- dune | 2 +- test.ml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dune b/dune index 27d8bc1..e222719 100644 --- a/dune +++ b/dune @@ -61,7 +61,7 @@ (executable (name test) - (libraries devkit extlib extunix libevent ounit2 yojson) + (libraries devkit extlib extunix libevent ocamlnet_lite ounit2 unix yojson) (modules test test_httpev)) (rule diff --git a/test.ml b/test.ml index 615415d..3552598 100644 --- a/test.ml +++ b/test.ml @@ -1,6 +1,7 @@ open OUnit open Printf open ExtLib +open Ocamlnet_lite module U = ExtUnix.Specific From 8db9a77b15ee80d7755666ee3ec84a89e43d2887 Mon Sep 17 00:00:00 2001 From: Javier Chavarri Date: Tue, 12 Dec 2023 13:51:02 +0000 Subject: [PATCH 5/8] add tests for url/html encoding/decoding --- test.ml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test.ml b/test.ml index 3552598..6518423 100644 --- a/test.ml +++ b/test.ml @@ -515,6 +515,22 @@ let () = test "bit_struct_list" @@ fun () -> t [ 0; 1; 2; 3; 4; 5; 6; 7 ]; () +let () = test "Web.htmldecode" @@ fun () -> + assert_equal (Web.htmldecode "A <p> tag & a <div> tag.") "A

tag & a

tag."; + () + +let () = test "Web.htmlencode" @@ fun () -> + assert_equal (Web.htmlencode "A

tag & a

tag.") "A <p> tag & a <div> tag."; + () + +let () = test "Web.urldecode" @@ fun () -> + assert_equal (Web.urldecode "Hello+G%C3%BCnter") "Hello Günter"; + () + +let () = test "Web.urlencode" @@ fun () -> + assert_equal (Web.urlencode "Hello Günter") "Hello+G%C3%BCnter"; + () + let tests () = let (_:test_results) = run_test_tt_main ("devkit" >::: List.rev !tests) in () From 17e7d39bbb269c86500b9dbbd2441ac0f2c604da Mon Sep 17 00:00:00 2001 From: Javier Chavarri Date: Tue, 12 Dec 2023 14:11:14 +0000 Subject: [PATCH 6/8] simplify netconversion --- ocamlnet_lite/netconversion.ml | 1355 +------------------------------ ocamlnet_lite/netconversion.mli | 131 +-- ocamlnet_lite/netencoding.ml | 148 +--- 3 files changed, 30 insertions(+), 1604 deletions(-) diff --git a/ocamlnet_lite/netconversion.ml b/ocamlnet_lite/netconversion.ml index c07e318..fe42963 100644 --- a/ocamlnet_lite/netconversion.ml +++ b/ocamlnet_lite/netconversion.ml @@ -18,383 +18,8 @@ let big_slice = (* 3 *) 250 type encoding = [ `Enc_utf8 (* UTF-8 *) - | `Enc_utf8_opt_bom - | `Enc_java - | `Enc_utf16 (* UTF-16 with unspecified endianess (restricted usage) *) - | `Enc_utf16_le (* UTF-16 little endian *) - | `Enc_utf16_be (* UTF-16 big endian *) - | `Enc_utf32 (* UTF-32 with unspecified endianess (restricted usage) *) - | `Enc_utf32_le (* UTF-32 little endian *) - | `Enc_utf32_be (* UTF-32 big endian *) - | `Enc_usascii (* US-ASCII (only 7 bit) *) - | `Enc_iso88591 (* ISO-8859-1 *) - | `Enc_iso88592 (* ISO-8859-2 *) - | `Enc_iso88593 (* ISO-8859-3 *) - | `Enc_iso88594 (* ISO-8859-4 *) - | `Enc_iso88595 (* ISO-8859-5 *) - | `Enc_iso88596 (* ISO-8859-6 *) - | `Enc_iso88597 (* ISO-8859-7 *) - | `Enc_iso88598 (* ISO-8859-8 *) - | `Enc_iso88599 (* ISO-8859-9 *) - | `Enc_iso885910 (* ISO-8859-10 *) - | `Enc_iso885911 (* ISO-8859-11 *) - | `Enc_iso885913 (* ISO-8859-13 *) - | `Enc_iso885914 (* ISO-8859-14 *) - | `Enc_iso885915 (* ISO-8859-15 *) - | `Enc_iso885916 (* ISO-8859-16 *) - | `Enc_koi8r - (* KOI8-R *) - (* http://koi8.pp.ru *) - | (*| `Enc_koi8u (* KOI8-U *) (* http://www.net.ua/KOI8-U/index.html *)*) - `Enc_jis0201 - (* JIS-X-0201 *) - | (* - | `Enc_jis0201_roman (* JIS-X-0201 only roman half *) - | `Enc_jis0201_kana (* JIS-X-0201 katakana half remapped to 0x21..XXX *) - | `Enc_jis0208_94x94 (* JIS-X-0208 in ISO-2022-style two byte encoding *) - | `Enc_jis0212_94x94 (* JIS-X-0212 in ISO-2022-style two byte encoding *) - *) - `Enc_eucjp - (* EUC-JP *) - | `Enc_euckr (* EUC-KR *) - | (* - | `Enc_iso2022 of iso2022_state - | `Enc_iso2022jp of iso2022jp_state - *) - (* Older standards: *) - `Enc_asn1_iso646 - (* only the language-neutral subset *) - | `Enc_asn1_T61 (* ITU T.61 ("Teletex") *) - | `Enc_asn1_printable - | (* Microsoft: *) - `Enc_windows1250 (* WINDOWS-1250 *) - | `Enc_windows1251 (* WINDOWS-1251 *) - | `Enc_windows1252 (* WINDOWS-1252 *) - | `Enc_windows1253 (* WINDOWS-1253 *) - | `Enc_windows1254 (* WINDOWS-1254 *) - | `Enc_windows1255 (* WINDOWS-1255 *) - | `Enc_windows1256 (* WINDOWS-1256 *) - | `Enc_windows1257 (* WINDOWS-1257 *) - | `Enc_windows1258 (* WINDOWS-1258 *) - | (* IBM, ASCII-based: *) - `Enc_cp437 - | `Enc_cp737 - | `Enc_cp775 - | `Enc_cp850 - | `Enc_cp852 - | `Enc_cp855 - | `Enc_cp856 - | `Enc_cp857 - | `Enc_cp860 - | `Enc_cp861 - | `Enc_cp862 - | `Enc_cp863 - | `Enc_cp864 - | `Enc_cp865 - | `Enc_cp866 (* Russian *) - | `Enc_cp869 - | `Enc_cp874 - | `Enc_cp1006 - | (* IBM, EBCDIC-based: *) - `Enc_cp037 - (* EBCDIC USA Canada *) - (* 273: EBCDIC Germany, Austria, - * 277: Denmark, Norway, - * 278: Finland, Sweden, - * 280: Italy, - * 284: Spain, Latin America, - * 285: United Kingdom, - * 297: France, - * 871: Iceland, - *) - | `Enc_cp424 - | `Enc_cp500 (* EBCDIC International *) - | `Enc_cp875 (* EBCDIC Modern Greek *) - | `Enc_cp1026 (* EBCDIC Turkish *) - | `Enc_cp1047 (* EBCDIC Latin1, OS 390 System Services *) - | (* Adobe: *) - `Enc_adobe_standard_encoding - | `Enc_adobe_symbol_encoding - | `Enc_adobe_zapf_dingbats_encoding - | (* Apple: *) - `Enc_macroman | (* Encoding subset: *) - `Enc_subset of encoding * (int -> bool) - | `Enc_empty ] - -type charset = - [ `Set_unicode (* The full Unicode repertoire *) - | `Set_usascii (* US-ASCII (only 7 bit) *) - | `Set_iso88591 (* ISO-8859-1 *) - | `Set_iso88592 (* ISO-8859-2 *) - | `Set_iso88593 (* ISO-8859-3 *) - | `Set_iso88594 (* ISO-8859-4 *) - | `Set_iso88595 (* ISO-8859-5 *) - | `Set_iso88596 (* ISO-8859-6 *) - | `Set_iso88597 (* ISO-8859-7 *) - | `Set_iso88598 (* ISO-8859-8 *) - | `Set_iso88599 (* ISO-8859-9 *) - | `Set_iso885910 (* ISO-8859-10 *) - | `Set_iso885911 (* ISO-8859-11 *) - | `Set_iso885913 (* ISO-8859-13 *) - | `Set_iso885914 (* ISO-8859-14 *) - | `Set_iso885915 (* ISO-8859-15 *) - | `Set_iso885916 (* ISO-8859-16 *) - | `Set_koi8r (* KOI8-R *) - | `Set_jis0201 (* JIS-X-0201 *) - | `Set_jis0208 (* JIS-X-0208 *) - | `Set_jis0212 (* JIS-X-0212 *) - | `Set_ks1001 (* KS-X-1001 *) - | `Set_asn1_iso646 - | `Set_asn1_T61 - | `Set_asn1_printable - | (* Microsoft: *) - `Set_windows1250 (* WINDOWS-1250 *) - | `Set_windows1251 (* WINDOWS-1251 *) - | `Set_windows1252 (* WINDOWS-1252 *) - | `Set_windows1253 (* WINDOWS-1253 *) - | `Set_windows1254 (* WINDOWS-1254 *) - | `Set_windows1255 (* WINDOWS-1255 *) - | `Set_windows1256 (* WINDOWS-1256 *) - | `Set_windows1257 (* WINDOWS-1257 *) - | `Set_windows1258 (* WINDOWS-1258 *) - | (* IBM, ASCII-based: *) - `Set_cp437 - | `Set_cp737 - | `Set_cp775 - | `Set_cp850 - | `Set_cp852 - | `Set_cp855 - | `Set_cp856 - | `Set_cp857 - | `Set_cp860 - | `Set_cp861 - | `Set_cp862 - | `Set_cp863 - | `Set_cp864 - | `Set_cp865 - | `Set_cp866 - | `Set_cp869 - | `Set_cp874 - | `Set_cp1006 - | (* IBM, EBCDIC-based: *) - `Set_cp037 - | `Set_cp424 - | `Set_cp500 - | `Set_cp875 - | `Set_cp1026 - | `Set_cp1047 - | (* Adobe: *) - `Set_adobe_standard_encoding - | `Set_adobe_symbol_encoding - | `Set_adobe_zapf_dingbats_encoding - | (* Apple: *) - `Set_macroman ] - -let ascii_compat_encodings = - [ - `Enc_utf8; - `Enc_utf8_opt_bom; - `Enc_java; - `Enc_usascii; - `Enc_iso88591; - `Enc_iso88592; - `Enc_iso88593; - `Enc_iso88594; - `Enc_iso88595; - `Enc_iso88596; - `Enc_iso88597; - `Enc_iso88598; - `Enc_iso88599; - `Enc_iso885910; - `Enc_iso885911; - `Enc_iso885913; - `Enc_iso885914; - `Enc_iso885915; - `Enc_iso885916; - `Enc_koi8r; - `Enc_windows1250; - `Enc_windows1251; - `Enc_windows1252; - `Enc_windows1253; - `Enc_windows1254; - `Enc_windows1255; - `Enc_windows1256; - `Enc_windows1257; - `Enc_windows1258; - `Enc_cp437; - `Enc_cp737; - `Enc_cp775; - `Enc_cp850; - `Enc_cp852; - `Enc_cp855; - `Enc_cp856; - `Enc_cp857; - `Enc_cp860; - `Enc_cp861; - `Enc_cp862; - `Enc_cp863; - `Enc_cp864; - `Enc_cp865; - `Enc_cp866; - `Enc_cp869; - `Enc_cp874; - `Enc_cp1006; - `Enc_eucjp; - `Enc_euckr; - `Enc_macroman; - ] - -let rec is_ascii_compatible = function - | `Enc_subset (e, _) -> is_ascii_compatible e - | e -> List.mem e ascii_compat_encodings - -let rec is_single_byte = function - | `Enc_utf8 | `Enc_utf8_opt_bom | `Enc_java | `Enc_utf16 | `Enc_utf16_le - | `Enc_utf16_be | `Enc_utf32 | `Enc_utf32_le | `Enc_utf32_be -> - false - | `Enc_eucjp -> false - | `Enc_euckr -> false - | `Enc_subset (e, _) -> is_single_byte e - | _ -> true - -let internal_name (cs : charset) = - (* The name used for netdb lookups *) - match cs with - | `Set_unicode -> "unicode" - | `Set_usascii -> "usascii" - | `Set_iso88591 -> "iso88591" - | `Set_iso88592 -> "iso88592" - | `Set_iso88593 -> "iso88593" - | `Set_iso88594 -> "iso88594" - | `Set_iso88595 -> "iso88595" - | `Set_iso88596 -> "iso88596" - | `Set_iso88597 -> "iso88597" - | `Set_iso88598 -> "iso88598" - | `Set_iso88599 -> "iso88599" - | `Set_iso885910 -> "iso885910" - | `Set_iso885911 -> "iso885911" - | `Set_iso885913 -> "iso885913" - | `Set_iso885914 -> "iso885914" - | `Set_iso885915 -> "iso885915" - | `Set_iso885916 -> "iso885916" - | `Set_koi8r -> "koi8r" - | `Set_jis0201 -> "jis0201" - | `Set_jis0208 -> "jis0208" - | `Set_jis0212 -> "jis0212" - | `Set_ks1001 -> "ks1001" - | `Set_asn1_iso646 -> "asn1_iso646" - | `Set_asn1_T61 -> "asn1_t61" - | `Set_asn1_printable -> "asn1_printable" - | `Set_windows1250 -> "windows1250" - | `Set_windows1251 -> "windows1251" - | `Set_windows1252 -> "windows1252" - | `Set_windows1253 -> "windows1253" - | `Set_windows1254 -> "windows1254" - | `Set_windows1255 -> "windows1255" - | `Set_windows1256 -> "windows1256" - | `Set_windows1257 -> "windows1257" - | `Set_windows1258 -> "windows1258" - | `Set_cp437 -> "cp437" - | `Set_cp737 -> "cp737" - | `Set_cp775 -> "cp775" - | `Set_cp850 -> "cp850" - | `Set_cp852 -> "cp852" - | `Set_cp855 -> "cp855" - | `Set_cp856 -> "cp856" - | `Set_cp857 -> "cp857" - | `Set_cp860 -> "cp860" - | `Set_cp861 -> "cp861" - | `Set_cp862 -> "cp862" - | `Set_cp863 -> "cp863" - | `Set_cp864 -> "cp864" - | `Set_cp865 -> "cp865" - | `Set_cp866 -> "cp866" - | `Set_cp869 -> "cp869" - | `Set_cp874 -> "cp874" - | `Set_cp1006 -> "cp1006" - | `Set_cp037 -> "cp037" - | `Set_cp424 -> "cp424" - | `Set_cp500 -> "cp500" - | `Set_cp875 -> "cp875" - | `Set_cp1026 -> "cp1026" - | `Set_cp1047 -> "cp1047" - | `Set_adobe_standard_encoding -> "adobe_standard_encoding" - | `Set_adobe_symbol_encoding -> "adobe_symbol_encoding" - | `Set_adobe_zapf_dingbats_encoding -> "adobe_zapf_dingbats_encoding" - | `Set_macroman -> "macroman" - -let rec required_charsets (e : encoding) = - (* The name is a bit misleading. The function returns the charsets that - * correspond to the conversion tables that are required to support the - * encoding. - *) - match e with - | `Enc_utf8 | `Enc_utf8_opt_bom | `Enc_java | `Enc_utf16 | `Enc_utf16_le - | `Enc_utf16_be | `Enc_utf32 | `Enc_utf32_le | `Enc_utf32_be -> - [] - | `Enc_usascii -> [] - | `Enc_iso88591 -> [] - | `Enc_iso88592 -> [ `Set_iso88592 ] - | `Enc_iso88593 -> [ `Set_iso88593 ] - | `Enc_iso88594 -> [ `Set_iso88594 ] - | `Enc_iso88595 -> [ `Set_iso88595 ] - | `Enc_iso88596 -> [ `Set_iso88596 ] - | `Enc_iso88597 -> [ `Set_iso88597 ] - | `Enc_iso88598 -> [ `Set_iso88598 ] - | `Enc_iso88599 -> [ `Set_iso88599 ] - | `Enc_iso885910 -> [ `Set_iso885910 ] - | `Enc_iso885911 -> [ `Set_iso885911 ] - | `Enc_iso885913 -> [ `Set_iso885913 ] - | `Enc_iso885914 -> [ `Set_iso885914 ] - | `Enc_iso885915 -> [ `Set_iso885915 ] - | `Enc_iso885916 -> [ `Set_iso885916 ] - | `Enc_koi8r -> [ `Set_koi8r ] - | `Enc_jis0201 -> [ `Set_jis0201 ] - | `Enc_eucjp -> [ `Set_jis0201; `Set_jis0208; `Set_jis0212 ] - | `Enc_euckr -> [ `Set_ks1001 ] - | `Enc_asn1_iso646 -> [ `Set_asn1_iso646 ] - | `Enc_asn1_T61 -> [ `Set_asn1_T61 ] - | `Enc_asn1_printable -> [ `Set_asn1_printable ] - | `Enc_windows1250 -> [ `Set_windows1250 ] - | `Enc_windows1251 -> [ `Set_windows1251 ] - | `Enc_windows1252 -> [ `Set_windows1252 ] - | `Enc_windows1253 -> [ `Set_windows1253 ] - | `Enc_windows1254 -> [ `Set_windows1254 ] - | `Enc_windows1255 -> [ `Set_windows1255 ] - | `Enc_windows1256 -> [ `Set_windows1256 ] - | `Enc_windows1257 -> [ `Set_windows1257 ] - | `Enc_windows1258 -> [ `Set_windows1258 ] - | `Enc_cp437 -> [ `Set_cp437 ] - | `Enc_cp737 -> [ `Set_cp737 ] - | `Enc_cp775 -> [ `Set_cp775 ] - | `Enc_cp850 -> [ `Set_cp850 ] - | `Enc_cp852 -> [ `Set_cp852 ] - | `Enc_cp855 -> [ `Set_cp855 ] - | `Enc_cp856 -> [ `Set_cp856 ] - | `Enc_cp857 -> [ `Set_cp857 ] - | `Enc_cp860 -> [ `Set_cp860 ] - | `Enc_cp861 -> [ `Set_cp861 ] - | `Enc_cp862 -> [ `Set_cp862 ] - | `Enc_cp863 -> [ `Set_cp863 ] - | `Enc_cp864 -> [ `Set_cp864 ] - | `Enc_cp865 -> [ `Set_cp865 ] - | `Enc_cp866 -> [ `Set_cp866 ] - | `Enc_cp869 -> [ `Set_cp869 ] - | `Enc_cp874 -> [ `Set_cp874 ] - | `Enc_cp1006 -> [ `Set_cp1006 ] - | `Enc_cp037 -> [ `Set_cp037 ] - | `Enc_cp424 -> [ `Set_cp424 ] - | `Enc_cp500 -> [ `Set_cp500 ] - | `Enc_cp875 -> [ `Set_cp875 ] - | `Enc_cp1026 -> [ `Set_cp1026 ] - | `Enc_cp1047 -> [ `Set_cp1047 ] - | `Enc_adobe_standard_encoding -> [ `Set_adobe_standard_encoding ] - | `Enc_adobe_symbol_encoding -> [ `Set_adobe_symbol_encoding ] - | `Enc_adobe_zapf_dingbats_encoding -> [ `Set_adobe_zapf_dingbats_encoding ] - | `Enc_macroman -> [ `Set_macroman ] - | `Enc_subset (e', _) -> required_charsets e' - | `Enc_empty -> [] + `Enc_subset of encoding * (int -> bool) ] (* Internal conversion interface: * @@ -455,14 +80,7 @@ let rec required_charsets (e : encoding) = * - The validity of the input encoding needs not to be checked *) -exception Malformed_code_read of (int * int * encoding);; - -(* not exported! *) - -Callback.register_exception "Netconversion.Malformed_code_read" - (Malformed_code_read (0, 0, `Enc_empty)) - -(* Needed by netaccel_c.c *) +exception Malformed_code_read of (int * int * encoding) (* UNSAFE_OPT: A number of functions have been optimized by using * unsafe features of O'Caml (unsafe_get, unsafe_set, unsafe_chr). @@ -483,104 +101,6 @@ type poly_reader = { int * int * encoding; } -let read_iso88591 maxcode enc = - (* UNSAFE_OPT *) - let read ops slice_char slice_blen s_in p_in l_in = - let open Netstring_tstring in - assert (Array.length slice_char = Array.length slice_blen); - assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); - let m = min l_in (Array.length slice_char) in - let m3 = m / 3 in - for k3 = 0 to m3 - 1 do - let k = 3 * k3 in - (* let ch = Char.code s_in.[ p_in + k ] in *) - let chars = ops.unsafe_get3 s_in (p_in + k) in - let c0 = chars lsr 16 in - let c1 = (chars lsr 8) land 0xff in - let c2 = chars land 0xff in - if c0 > maxcode then ( - slice_char.(k) <- -1; - raise (Malformed_code_read (k, k, enc))); - Array.unsafe_set slice_char k c0; - if c1 > maxcode then ( - slice_char.(k + 1) <- -1; - raise (Malformed_code_read (k + 1, k + 1, enc))); - Array.unsafe_set slice_char (k + 1) c1; - if c2 > maxcode then ( - slice_char.(k + 2) <- -1; - raise (Malformed_code_read (k + 2, k + 2, enc))); - Array.unsafe_set slice_char (k + 2) c2 - done; - for k = 3 * m3 to m - 1 do - let c0 = Char.code (ops.unsafe_get s_in (p_in + k)) in - if c0 > maxcode then ( - slice_char.(k) <- -1; - raise (Malformed_code_read (k, k, enc))); - Array.unsafe_set slice_char k c0 - done; - if m < Array.length slice_char then slice_char.(m) <- -1; - (m, m, enc) - in - { read } - -let read_iso88591_ref = ref read_iso88591 - -let get_8bit_to_unicode_map enc = - let cs = - match required_charsets enc with - | [ cs ] -> cs - | _ -> failwith "get_8bit_to_unicode_map" - in - let to_unicode = Netmappings.get_to_unicode (internal_name cs) in - assert (Array.length to_unicode = 256); - to_unicode - -let read_8bit enc = - let m_to_unicode = get_8bit_to_unicode_map enc in - - (* the 256-byte array mapping the character set to unicode *) - let read ops slice_char slice_blen s_in p_in l_in = - (* UNSAFE_OPT *) - let open Netstring_tstring in - assert (Array.length slice_char = Array.length slice_blen); - assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); - let m = min l_in (Array.length slice_char) in - let m3 = m / 3 in - for k3 = 0 to m3 - 1 do - let k = 3 * k3 in - let chars = ops.unsafe_get3 s_in k in - let c0 = chars lsr 16 in - let c1 = (chars lsr 8) land 0xff in - let c2 = chars land 0xff in - let c0_uni = Array.unsafe_get m_to_unicode c0 in - if c0_uni < 0 then ( - slice_char.(k) <- -1; - raise (Malformed_code_read (k, k, enc))); - Array.unsafe_set slice_char k c0_uni; - let c1_uni = Array.unsafe_get m_to_unicode c1 in - if c1_uni < 0 then ( - slice_char.(k + 1) <- -1; - raise (Malformed_code_read (k + 1, k + 1, enc))); - Array.unsafe_set slice_char (k + 1) c1_uni; - let c2_uni = Array.unsafe_get m_to_unicode c2 in - if c2_uni < 0 then ( - slice_char.(k + 2) <- -1; - raise (Malformed_code_read (k + 2, k + 2, enc))); - Array.unsafe_set slice_char (k + 2) c2_uni - done; - for k = 3 * m3 to m - 1 do - let c0 = Char.code (ops.get s_in k) in - let c0_uni = Array.unsafe_get m_to_unicode c0 in - if c0_uni < 0 then ( - slice_char.(k) <- -1; - raise (Malformed_code_read (k, k, enc))); - Array.unsafe_set slice_char k c0_uni - done; - if m < Array.length slice_char then slice_char.(m) <- -1; - (m, m, enc) - in - { read } - let read_utf8 is_java = (* UNSAFE_OPT *) let read ops slice_char slice_blen s_in p_in l_in = @@ -742,381 +262,6 @@ let read_utf8 is_java = let read_utf8_ref = ref read_utf8 -let have_utf8_bom ops s p = - let open Netstring_tstring in - let c0 = ops.get s (p + 0) in - let c1 = ops.get s (p + 1) in - let c2 = ops.get s (p + 2) in - c0 = '\xEF' && c1 = '\xBB' && c2 = '\xBF' - -let read_utf8_opt_bom expose_bom = - let read ops slice_char slice_blen s_in p_in l_in = - let open Netstring_tstring in - assert (Array.length slice_char = Array.length slice_blen); - assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); - (* Expect a BOM at the beginning of the text *) - if l_in >= 3 then - if have_utf8_bom ops s_in p_in then ( - let p_in1, l_in1 = - if expose_bom then (p_in, l_in) else (p_in + 3, l_in - 3) - in - let n_ret, p_ret, enc = - (!read_utf8_ref false).read ops slice_char slice_blen s_in p_in1 l_in1 - in - let p_ret1 = if expose_bom then p_ret else p_ret + 3 in - if expose_bom && n_ret >= 1 then slice_char.(0) <- -3; - (n_ret, p_ret1, enc)) - else (!read_utf8_ref false).read ops slice_char slice_blen s_in p_in l_in - else - let bom_possible = - l_in = 0 - || (l_in = 1 && ops.get s_in 0 = '\xEF') - || (l_in = 2 && ops.get s_in 0 = '\xEF' && ops.get s_in 1 = '\xBB') - in - if bom_possible then (0, 0, `Enc_utf8_opt_bom) - else (!read_utf8_ref false).read ops slice_char slice_blen s_in p_in l_in - in - { read } - -let surrogate_offset = 0x10000 - (0xD800 lsl 10) - 0xDC00 - -let read_utf16_lebe lo hi n_start enc = - (* lo=0, hi=1: little endian - * lo=1, hi=0: big endian - * n_start: First cell in slice to use - *) - let read ops slice_char slice_blen s_in p_in l_in = - let open Netstring_tstring in - assert (Array.length slice_char = Array.length slice_blen); - assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); - - let malformed_code k n = - slice_char.(n) <- -1; - raise (Malformed_code_read (n, k, enc)) - in - - (* k: counts the bytes - * n: counts the characters - *) - let rec put_loop k n = - if k + 1 < l_in && n < Array.length slice_char then - let p = - Char.code (ops.get s_in (p_in + k + lo)) - lor (Char.code (ops.get s_in (p_in + k + hi)) lsl 8) - in - - if p >= 0xd800 && p < 0xe000 then - (* This is a surrogate pair. *) - if k + 3 < l_in then - if p <= 0xdbff then ( - let q = - Char.code (ops.get s_in (p_in + k + 2 + lo)) - lor (Char.code (ops.get s_in (p_in + k + 2 + hi)) lsl 8) - in - if q < 0xdc00 || q > 0xdfff then malformed_code k n; - let eff_p = (p lsl 10) + q + surrogate_offset in - slice_char.(n) <- eff_p; - slice_blen.(n) <- 4; - put_loop (k + 4) (n + 1)) - else (* Malformed pair: *) - malformed_code k n - else (n, k) - else if (* Normal 2-byte character *) - p = 0xfffe then - (* Wrong byte order mark: It is illegal here *) - malformed_code k n - else ( - (* A regular code point *) - slice_char.(n) <- p; - slice_blen.(n) <- 2; - put_loop (k + 2) (n + 1)) - else (n, k) - in - let n, k = put_loop 0 n_start in - if n < Array.length slice_char then (* EOF marker *) - slice_char.(n) <- -1; - (n, k, enc) - in - { read } - -let get_endianess ops s_in p_in = - let open Netstring_tstring in - let c0 = ops.get s_in (p_in + 0) in - let c1 = ops.get s_in (p_in + 1) in - if c0 = '\254' && c1 = '\255' then `Big_endian - else if c0 = '\255' && c1 = '\254' then `Little_endian - else `No_BOM - -(* expose_bom: when true, the BOM is considered as a character and - * put as value (-3) into slice_char - *) - -let read_utf16 expose_bom = - let read ops slice_char slice_blen s_in p_in l_in = - let open Netstring_tstring in - assert (Array.length slice_char = Array.length slice_blen); - assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); - (* Expect a BOM at the beginning of the text *) - if l_in >= 2 then ( - if expose_bom then ( - slice_char.(0) <- -3; - slice_blen.(0) <- 0 (* Later corrected *)); - match get_endianess ops s_in p_in with - | `Big_endian -> - let n_start = if expose_bom then 1 else 0 in - let n, k, enc' = - (read_utf16_lebe 1 0 n_start `Enc_utf16_be).read ops slice_char - slice_blen s_in (p_in + 2) (l_in - 2) - in - if n > 0 then slice_blen.(0) <- slice_blen.(0) + 2; - (n, k + 2, enc') - | `Little_endian -> - let n_start = if expose_bom then 1 else 0 in - let n, k, enc' = - (read_utf16_lebe 0 1 n_start `Enc_utf16_le).read ops slice_char - slice_blen s_in (p_in + 2) (l_in - 2) - in - if n > 0 then slice_blen.(0) <- slice_blen.(0) + 2; - (n, k + 2, enc') - | `No_BOM -> - (* byte order mark missing *) - slice_char.(0) <- -1; - raise (Malformed_code_read (0, 0, `Enc_utf16))) - else ( - slice_char.(0) <- -1; - (0, 0, `Enc_utf16)) - in - { read } - -let read_utf32_lebe little n_start enc = - (* little: whether little endian - * n_start: First cell in slice to use - *) - let read ops slice_char slice_blen s_in p_in l_in = - let open Netstring_tstring in - assert (Array.length slice_char = Array.length slice_blen); - assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); - - let malformed_code k n = - slice_char.(n) <- -1; - raise (Malformed_code_read (n, k, enc)) - in - - let b0 = if little then 0 else 3 in - let b1 = if little then 1 else 2 in - let b2 = if little then 2 else 1 in - let b3 = if little then 3 else 0 in - - (* k: counts the bytes - * n: counts the characters - *) - let rec put_loop k n = - if k + 3 < l_in && n < Array.length slice_char then ( - let p3 = Char.code (ops.get s_in (p_in + k + b3)) in - if p3 <> 0 then malformed_code k n; - let p = - Char.code (ops.get s_in (p_in + k + b0)) - lor (Char.code (ops.get s_in (p_in + k + b1)) lsl 8) - lor (Char.code (ops.get s_in (p_in + k + b2)) lsl 16) - in - if (p >= 0xD800 && p <= 0xDFFF) || p >= 0x10FFFF then malformed_code k n; - if p = 0xfffe then - (* Wrong byte order mark: It is illegal here *) - malformed_code k n; - slice_char.(n) <- p; - slice_blen.(n) <- 4; - put_loop (k + 4) (n + 1)) - else (n, k) - in - let n, k = put_loop 0 n_start in - if n < Array.length slice_char then (* EOF marker *) - slice_char.(n) <- -1; - (n, k, enc) - in - { read } - -let get_endianess32 ops s_in p_in = - let open Netstring_tstring in - let c0 = ops.get s_in (p_in + 0) in - let c1 = ops.get s_in (p_in + 1) in - let c2 = ops.get s_in (p_in + 2) in - let c3 = ops.get s_in (p_in + 3) in - if c0 = '\000' && c1 = '\000' && c2 = '\254' && c3 = '\255' then `Big_endian - else if c0 = '\255' && c1 = '\254' && c2 = '\000' && c3 = '\000' then - `Little_endian - else `No_BOM - -let read_utf32 expose_bom = - let read ops slice_char slice_blen s_in p_in l_in = - let open Netstring_tstring in - assert (Array.length slice_char = Array.length slice_blen); - assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); - (* Expect a BOM at the beginning of the text *) - if l_in >= 4 then ( - if expose_bom then ( - slice_char.(0) <- -3; - slice_blen.(0) <- 0 (* Later corrected *)); - match get_endianess32 ops s_in p_in with - | `Big_endian -> - let n_start = if expose_bom then 1 else 0 in - let n, k, enc' = - (read_utf32_lebe false n_start `Enc_utf32_be).read ops slice_char - slice_blen s_in (p_in + 4) (l_in - 4) - in - if n > 0 then slice_blen.(0) <- slice_blen.(0) + 4; - (n, k + 4, enc') - | `Little_endian -> - let n_start = if expose_bom then 1 else 0 in - let n, k, enc' = - (read_utf32_lebe true n_start `Enc_utf32_le).read ops slice_char - slice_blen s_in (p_in + 4) (l_in - 4) - in - if n > 0 then slice_blen.(0) <- slice_blen.(0) + 4; - (n, k + 4, enc') - | `No_BOM -> - (* byte order mark missing *) - slice_char.(0) <- -1; - raise (Malformed_code_read (0, 0, `Enc_utf32))) - else ( - slice_char.(0) <- -1; - (0, 0, `Enc_utf32)) - in - { read } - -let read_euc len1 len2 len3 map1 map2 map3 enc = - (* Code set 0 is US-ASCII. - * Code sets 1, 2, 3 may be anything. lenX = 0: code set is not supported. - * lenX is either 0, 1, or 2. - *) - (* UNSAFE_OPT *) - let open Netstring_tstring in - assert (len1 >= 0 && len1 <= 2); - assert (len2 >= 0 && len2 <= 2); - assert (len3 >= 0 && len3 <= 2); - - let read ops slice_char slice_blen s_in p_in l_in = - assert (Array.length slice_char = Array.length slice_blen); - assert (p_in >= 0 && p_in + l_in <= ops.length s_in && l_in >= 0); - - (* k: counts the bytes - * n: counts the characters - *) - let p = ref p_in in - let p_max = p_in + l_in in - let n = ref 0 in - let n_ret = ref (-1) in - - let malformed_code () = - slice_char.(!n) <- -1; - raise (Malformed_code_read (!n, !p - p_in, enc)) - in - - let slice_length = Array.length slice_char in - - while !p < p_max && !n < slice_length do - let k_inc = - (* length of the character in bytes; 0 means: stop *) - - (* We know: - * (1) p_in >= 0 ==> !p >= 0 - * (2) !p < p_max = p_in + l_in <= String.length s_in - * ==> unsafe get ok - *) - (* match s_in.[k_in + k] with *) - match ops.unsafe_get s_in !p with - | '\000' .. '\127' as x -> - (* US-ASCII *) - Array.unsafe_set slice_char !n (Char.code x); - (* ok *) - 1 - | '\142' -> - (* Code set 2 *) - if len2 = 0 then malformed_code (); - if !p + len2 >= p_max then 0 - else - let x1 = Char.code (ops.get s_in (!p + 1)) in - let x2 = - if len2 = 1 then 256 else Char.code (ops.get s_in (!p + 2)) - in - if x1 < 160 || x2 < 160 then malformed_code (); - let uni = map2 x1 x2 in - Array.unsafe_set slice_char !n uni; - (* ok *) - len2 + 1 - | '\143' -> - (* Code set 3 *) - if len3 = 0 then malformed_code (); - if !p + len3 >= p_max then 0 - else - let x1 = Char.code (ops.get s_in (!p + 1)) in - let x2 = - if len3 = 1 then 256 else Char.code (ops.get s_in (!p + 2)) - in - if x1 < 160 || x2 < 160 then malformed_code (); - let uni = map3 x1 x2 in - Array.unsafe_set slice_char !n uni; - (* ok *) - len3 + 1 - | '\160' .. '\255' as x1_code -> - (* Code set 1 *) - if !p + len1 > p_max then 0 - else - let x1 = Char.code x1_code in - let x2 = - if len1 = 1 then 256 else Char.code (ops.get s_in (!p + 1)) - in - if x2 < 160 then malformed_code (); - let uni = map1 x1 x2 in - Array.unsafe_set slice_char !n uni; - (* ok *) - len1 - | _ -> - (* illegal *) - malformed_code () - in - - (* If k_inc = 0, the character was partially outside the processed - * range of the string, and could not be decoded. - *) - if k_inc > 0 then ( - (* We know: - * (1) n >= 0, because n starts with 0 and is only increased - * (2) n < Array.length slice_char = Array.length slice_blen - * ==> unsafe set ok - *) - (* slice_blen.(n) <- k_inc; *) - Array.unsafe_set slice_blen !n k_inc; - (* next iteration: *) - p := !p + k_inc; - incr n) - else ( - (* Stop loop: *) - n_ret := !n; - n := slice_length) - done; - - if !n_ret = -1 then n_ret := !n; - if !n_ret < slice_length then (* EOF marker *) - slice_char.(!n_ret) <- -1; - (!n_ret, !p - p_in, enc) - in - { read } - -let read_eucjp () = - let jis0201 = Netmappings.get_to_unicode "jis0201" in - let jis0208 = Netmappings.get_to_unicode "jis0208" in - let jis0212 = lazy (Netmappings.get_to_unicode "jis0212") in - (* seldom *) - let map1 x1 x2 = jis0208.(((x1 - 160) * 96) + x2 - 160) in - let map2 x1 _ = jis0201.(x1) in - let map3 x1 x2 = (Lazy.force jis0212).(((x1 - 160) * 96) + x2 - 160) in - read_euc 2 1 2 map1 map2 map3 `Enc_eucjp - -let read_euckr () = - let ks1001 = Netmappings.get_to_unicode "ks1001" in - let map x1 x2 = ks1001.(((x1 - 160) * 96) + x2 - 160) in - read_euc 2 0 0 map map map `Enc_euckr - let read_subset inner_read def = let read ops slice_char slice_blen s_in p_in l_in = let open Netstring_tstring in @@ -1152,148 +297,6 @@ let read_subset inner_read def = * write_XXX slice_char slice_length s_out p_out l_out subst *) -let write_iso88591 maxcode slice_char slice_pos slice_length s_out p_out l_out - subst = - (* UNSAFE_OPT *) - (* Use maxcode=255 for ISO-8859-1, and maxcode=127 for US-ASCII, - * and maxcode=(-1) for `Enc_empty. - *) - assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); - assert (slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); - assert (maxcode <= 255); - - let n = ref slice_pos in - (* index of slice *) - let n_ret = ref (-1) in - (* returned number of characters *) - let n_max = slice_pos + slice_length in - - let p = ref p_out in - (* current output position *) - let p_max = p_out + l_out in - - (* maximum output position *) - while !n < n_max && !p < p_max do - (* We know: - * (1) !n >= 0, because it starts with 0 and is only increased - * (2) !n < n_max = slice_pos + slice_length <= Array.length slice_char - * ==> unsafe get ok - *) - let ch = Array.unsafe_get slice_char !n in - if ch >= 0 && ch <= maxcode then ( - (* Because !p < p_max: - * !p < p_max = p_out + l_out <= String.length s_out - * Furthermore, p_out >= 0, !p >= 0. - * ==> unsafe set ok - *) - (* s_out.[ !p ] <- Char.chr ch; *) - Bytes.unsafe_set s_out !p (Char.unsafe_chr ch); - incr n; - incr p) - else ( - assert (ch >= 0); - let replacement = subst ch in - let l_repl = String.length replacement in - if l_repl > multibyte_limit then - failwith "Netconversion.write_iso88591: Substitution string too long"; - if !p + l_repl <= p_max then ( - (* Enough space to store 'replacement': *) - Bytes.blit_string replacement 0 s_out !p l_repl; - p := !p + l_repl; - incr n) - else ( - (* Exit whole conversion *) - n_ret := !n; - n := n_max)) - done; - if !n_ret >= 0 then (!n_ret - slice_pos, !p - p_out) - else (!n - slice_pos, !p - p_out) - -let get_8bit_from_unicode_map enc = - let cs = - match required_charsets enc with - | [ cs ] -> cs - | _ -> failwith "get_8bit_from_unicode_map" - in - let from_unicode = Netmappings.get_from_unicode (internal_name cs) in - assert (Array.length from_unicode = 256); - from_unicode - -let write_8bit enc = - (* UNSAFE_OPT *) - let m_from_unicode = get_8bit_from_unicode_map enc in - let m_mask = Array.length m_from_unicode - 1 in - - fun slice_char slice_pos slice_length s_out p_out l_out subst -> - assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); - assert ( - slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); - - let n = ref slice_pos in - (* index of slice *) - let n_max = slice_pos + slice_length in - - let k = ref 0 in - (* written bytes *) - let n_ret = ref (-1) in - - (* returned number of characters *) - while !n < n_max && !k < l_out do - (* We know: - * (1) !n >= 0, because it starts with 0 and is only increased - * (2) !n < n_max = slice_pos + slice_length <= Array.length slice - * ==> unsafe get ok - *) - let p = - (* slice_char.( !n ) *) - Array.unsafe_get slice_char !n - in - let p' = - match Array.unsafe_get m_from_unicode (p land m_mask) with - | Netmappings.U_nil -> -1 - | Netmappings.U_single (p0, q0) -> if p0 = p then q0 else -1 - | Netmappings.U_double (p0, q0, p1, q1) -> - if p0 = p then q0 else if p1 = p then q1 else -1 - | Netmappings.U_array pq -> - let r = ref (-1) in - let h = ref 0 in - while !r < 0 && !h < Array.length pq do - if pq.(!h) = p then r := pq.(!h + 1) else h := !h + 2 - done; - !r - in - - (* If p=-1 ==> p'=-1, because -1 is never mapped to any code point *) - if p' < 0 then ( - if p < 0 then assert false (* EOF mark found *) - else - let replacement = subst p in - let l_repl = String.length replacement in - if l_repl > multibyte_limit then - failwith "Netconversion.write_8bit: Substitution string too long"; - - if !k + l_repl <= l_out then ( - (* Enough space to store 'replacement': *) - Bytes.blit_string replacement 0 s_out (p_out + !k) l_repl; - k := !k + l_repl; - incr n) - else ( - (* Exit whole conversion *) - n_ret := !n; - n := n_max)) - else ( - (* Because !k < l_out: - * p_out + !k < p_out + l_out <= String.length s_out - * Furthermore, p_out >= 0, !k >= 0. - * ==> unsafe set ok - *) - (* s_out.[ p_out + !k ] <- Char.chr p'; *) - Bytes.unsafe_set s_out (p_out + !k) (Char.unsafe_chr (p' land 0xff)); - incr n; - incr k) - done; - if !n_ret >= 0 then (!n_ret - slice_pos, !k) else (!n - slice_pos, !k) - let write_utf8 is_java slice_char slice_pos slice_length s_out p_out l_out subst = (* UNSAFE_OPT *) @@ -1413,292 +416,6 @@ let write_utf8 is_java slice_char slice_pos slice_length s_out p_out l_out subst done; if !n_ret >= 0 then (!n_ret - slice_pos, !k) else (!n - slice_pos, !k) -let write_utf16_lebe lo hi slice_char slice_pos slice_length s_out p_out l_out - subst = - (* lo=0, hi=1: little endian - * lo=1, hi=0: big endian - *) - assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); - assert (slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); - - let n = ref slice_pos in - (* index of slice *) - let n_max = slice_pos + slice_length in - - let k = ref 0 in - (* written bytes *) - let n_ret = ref (-1) in - - (* returned number of characters *) - while !n < n_max do - let p = slice_char.(!n) in - - let index = p_out + !k in - - let k_inc = - if p >= 0xfffe then ( - if p <= 0x10ffff then ( - if p <= 0xffff then failwith "Netconversion.write_utf16_le"; - (* Must be written as surrogate pair *) - if !k + 3 < l_out then ( - let high = ((p - 0x10000) lsr 10) + 0xd800 in - let low = (p land 0x3ff) + 0xdc00 in - Bytes.set s_out (index + lo) (Char.chr (high land 0xff)); - Bytes.set s_out (index + hi) (Char.chr (high lsr 8)); - Bytes.set s_out (index + 2 + lo) (Char.chr (low land 0xff)); - Bytes.set s_out (index + 2 + hi) (Char.chr (low lsr 8)); - 4) - else -1) - else - (* Higher code points are not possible in XML; call subst *) - let replacement = subst p in - let l_repl = String.length replacement in - if l_repl > multibyte_limit then - failwith - "Netconversion.write_utf16_le: Substitution string too long"; - if !k + l_repl <= l_out then ( - (* Enough space to store 'replacement': *) - Bytes.blit_string replacement 0 s_out (p_out + !k) l_repl; - l_repl (* may be 0! *)) - else -1 (* Exit whole conversion *)) - else if (* 2-byte character *) - !k + 1 < l_out then ( - Bytes.set s_out (index + lo) (Char.unsafe_chr (p land 0xff)); - Bytes.set s_out (index + hi) (Char.unsafe_chr ((p lsr 8) land 0xff)); - 2) - else -1 - in - if k_inc >= 0 then ( - k := !k + k_inc; - incr n) - else ( - n_ret := !n; - n := n_max) - done; - if !n_ret >= 0 then (!n_ret - slice_pos, !k) else (!n - slice_pos, !k) - -let write_utf32_lebe little slice_char slice_pos slice_length s_out p_out l_out - subst = - assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); - assert (slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); - - let n = ref slice_pos in - (* index of slice *) - let n_max = slice_pos + slice_length in - - let k = ref 0 in - (* written bytes *) - let n_ret = ref (-1) in - - (* returned number of characters *) - let b0 = if little then 0 else 3 in - let b1 = if little then 1 else 2 in - let b2 = if little then 2 else 1 in - let b3 = if little then 3 else 0 in - - while !n < n_max do - let p = slice_char.(!n) in - - let index = p_out + !k in - - let k_inc = - if p <= 0x10ffff then - if !k + 3 < l_out then ( - Bytes.set s_out (index + b0) (Char.unsafe_chr (p land 0xff)); - Bytes.set s_out (index + b1) (Char.unsafe_chr ((p lsr 8) land 0xff)); - Bytes.set s_out (index + b2) (Char.unsafe_chr ((p lsr 16) land 0xff)); - Bytes.set s_out (index + b3) (Char.unsafe_chr 0); - 4) - else -1 - else - (* Higher code points are not possible in XML; call subst *) - let replacement = subst p in - let l_repl = String.length replacement in - if l_repl > multibyte_limit then - failwith "Netconversion.write_utf32: Substitution string too long"; - if !k + l_repl <= l_out then ( - (* Enough space to store 'replacement': *) - Bytes.blit_string replacement 0 s_out (p_out + !k) l_repl; - l_repl (* may be 0! *)) - else -1 - (* Exit whole conversion *) - in - if k_inc >= 0 then ( - k := !k + k_inc; - incr n) - else ( - n_ret := !n; - n := n_max) - done; - if !n_ret >= 0 then (!n_ret - slice_pos, !k) else (!n - slice_pos, !k) - -let write_euc map _enc - (* Code set 0 is US-ASCII. - * let (set, byte1, byte2) = map unicode: - * - set is 1, 2, 3, or 4. 4 means that the code point cannot be mapped. - * - byte1 >= 160, <= 255 - * - byte2 >= 160, <= 255, or byte2=256 meaning that it is not used - *) - (* UNSAFE_OPT *) - slice_char slice_pos slice_length s_out p_out l_out subst = - assert (p_out >= 0 && p_out + l_out <= Bytes.length s_out && l_out >= 0); - assert (slice_pos >= 0 && slice_pos + slice_length <= Array.length slice_char); - - let n = ref slice_pos in - (* index of slice *) - let n_max = slice_pos + slice_length in - - let k = ref 0 in - (* written bytes *) - let n_ret = ref (-1) in - - (* returned number of characters *) - while !n < n_max do - (* We know: - * (1) !n >= 0, because it starts with 0 and is only increased - * (2) !n < n_max = slice_pos + slice_length <= Array.length slice - * ==> unsafe get ok - *) - let p = - (* slice.( !n ) *) - Array.unsafe_get slice_char !n - in - assert (p >= 0); - - let index = p_out + !k in - - let set, b1, b2 = if p <= 127 then (0, p, 256) else map p in - - let k_inc = - (* k_inc: how many bytes are written *) - match set with - | 0 -> - if !k < l_out then ( - (* s_out.[index] <- Char.chr p; *) - Bytes.unsafe_set s_out index (Char.unsafe_chr (b1 land 127)); - 1) - else -1 - | 1 -> - let bl = if b2 = 256 then 1 else 2 in - if !k + bl < l_out then ( - assert (b1 >= 160 && b1 <= 255 && b2 >= 160 && b2 <= 256); - Bytes.set s_out index (Char.chr b1); - if b2 <> 256 then Bytes.set s_out (index + 1) (Char.chr b2); - bl) - else -1 - | 2 -> - let bl = if b2 = 256 then 2 else 3 in - if !k + bl < l_out then ( - assert (b1 >= 160 && b1 <= 255 && b2 >= 160 && b2 <= 256); - Bytes.set s_out index '\142'; - Bytes.set s_out (index + 1) (Char.chr b1); - if b2 <> 256 then Bytes.set s_out (index + 2) (Char.chr b2); - bl) - else -1 - | 3 -> - let bl = if b2 = 256 then 2 else 3 in - if !k + bl < l_out then ( - assert (b1 >= 160 && b1 <= 255 && b2 >= 160 && b2 <= 256); - Bytes.set s_out index '\143'; - Bytes.set s_out (index + 1) (Char.chr b1); - if b2 <> 256 then Bytes.set s_out (index + 2) (Char.chr b2); - bl) - else -1 - | 4 -> - let replacement = subst p in - let l_repl = String.length replacement in - if l_repl > multibyte_limit then - failwith "Netconversion.write_euc: Substitution string too long"; - if !k + l_repl <= l_out then ( - (* Enough space to store 'replacement': *) - Bytes.blit_string replacement 0 s_out (p_out + !k) l_repl; - l_repl) - else -1 - (* Exit whole conversion *) - | _ -> assert false - in - if k_inc >= 0 then ( - k := !k + k_inc; - incr n) - else ( - n_ret := !n; - n := n_max) - done; - if !n_ret >= 0 then (!n_ret - slice_pos, !k) else (!n - slice_pos, !k) - -let write_eucjp () = - let jis0201 = Netmappings.get_from_unicode "jis0201" in - let jis0208 = Netmappings.get_from_unicode "jis0208" in - let jis0212 = Netmappings.get_from_unicode "jis0212" in - - let jis0201_mask = Array.length jis0201 - 1 in - let jis0208_mask = Array.length jis0208 - 1 in - let jis0212_mask = Array.length jis0212 - 1 in - - let map p = - (* Try in order: jis0208, jis0201, jis0212 *) - let map_tbl jistbl jistbl_mask = - match jistbl.(p land jistbl_mask) with - | Netmappings.U_nil -> -1 - | Netmappings.U_single (p0, q0) -> if p0 = p then q0 else -1 - | Netmappings.U_double (p0, q0, p1, q1) -> - if p0 = p then q0 else if p1 = p then q1 else -1 - | Netmappings.U_array pq -> - let r = ref (-1) in - let h = ref 0 in - while !r < 0 && !h < Array.length pq do - if pq.(!h) = p then r := pq.(!h + 1) else h := !h + 2 - done; - !r - in - let cp_0208 = map_tbl jis0208 jis0208_mask in - if cp_0208 >= 0 then - let row = cp_0208 / 96 in - let col = cp_0208 - (row * 96) in - (1, row + 160, col + 160) - else - let cp_0201 = map_tbl jis0201 jis0201_mask in - if cp_0201 >= 128 then (* Ignore especially 0x5c, 0x7e *) - (2, cp_0201, 256) - else - let cp_0212 = map_tbl jis0212 jis0212_mask in - if cp_0212 >= 0 then - let row = cp_0212 / 96 in - let col = cp_0212 - (row * 96) in - (3, row + 160, col + 160) - else (4, 256, 256) - in - write_euc map `Enc_eucjp - -let write_euckr () = - let ks1001 = Netmappings.get_from_unicode "ks1001" in - - let ks1001_mask = Array.length ks1001 - 1 in - - let map p = - let map_tbl kstbl kstbl_mask = - match kstbl.(p land kstbl_mask) with - | Netmappings.U_nil -> -1 - | Netmappings.U_single (p0, q0) -> if p0 = p then q0 else -1 - | Netmappings.U_double (p0, q0, p1, q1) -> - if p0 = p then q0 else if p1 = p then q1 else -1 - | Netmappings.U_array pq -> - let r = ref (-1) in - let h = ref 0 in - while !r < 0 && !h < Array.length pq do - if pq.(!h) = p then r := pq.(!h + 1) else h := !h + 2 - done; - !r - in - let cp_1001 = map_tbl ks1001 ks1001_mask in - if cp_1001 >= 0 then - let row = cp_1001 / 96 in - let col = cp_1001 - (row * 96) in - (1, row + 160, col + 160) - else (4, 256, 256) - in - write_euc map `Enc_euckr - let special_cpoint = 0x110000 let write_subset inner_writer def slice_char slice_pos slice_length s_out p_out @@ -1724,67 +441,29 @@ let write_subset inner_writer def slice_char slice_pos slice_length s_out p_out inner_writer slice_char' 0 slice_length s_out p_out l_out subst' -type encoding1 = [ encoding | `Enc_utf16_bom | `Enc_utf32_bom | `Enc_utf8_bom ] - (* `Enc_*_bom considers the BOM as a character with code point -3. * This encoding is only internally used. *) -let rec get_reader1 (enc : encoding1) = +let rec get_reader1 (enc : encoding) = (* get_reader1 supports the additional internal encodings of * encoding1. get_reader (below) only supports the exported * encodings. *) match enc with - | `Enc_iso88591 -> !read_iso88591_ref 255 `Enc_iso88591 - | `Enc_usascii -> !read_iso88591_ref 127 `Enc_usascii - | `Enc_empty -> !read_iso88591_ref (-1) `Enc_empty | `Enc_utf8 -> !read_utf8_ref false - | `Enc_java -> !read_utf8_ref true - | `Enc_utf8_opt_bom -> read_utf8_opt_bom false - | `Enc_utf8_bom -> read_utf8_opt_bom true - | `Enc_utf16 -> read_utf16 false - | `Enc_utf16_bom -> read_utf16 true - | `Enc_utf16_le -> read_utf16_lebe 0 1 0 `Enc_utf16_le - | `Enc_utf16_be -> read_utf16_lebe 1 0 0 `Enc_utf16_be - | `Enc_utf32 -> read_utf32 false - | `Enc_utf32_bom -> read_utf32 true - | `Enc_utf32_le -> read_utf32_lebe true 0 `Enc_utf32_le - | `Enc_utf32_be -> read_utf32_lebe false 0 `Enc_utf32_be - | `Enc_eucjp -> read_eucjp () - | `Enc_euckr -> read_euckr () | `Enc_subset (e, def) -> - let reader' = get_reader1 (e :> encoding1) in + let reader' = get_reader1 e in read_subset reader' def - | #encoding as e -> read_8bit (e :> encoding) -let get_reader = (get_reader1 : encoding1 -> 'a :> encoding -> 'a) +let get_reader = (get_reader1 : encoding -> 'a :> encoding -> 'a) let rec get_writer enc = match enc with - | `Enc_iso88591 -> write_iso88591 255 - | `Enc_usascii -> write_iso88591 127 - | `Enc_empty -> write_iso88591 (-1) | `Enc_utf8 -> write_utf8 false - | `Enc_java -> write_utf8 true - | `Enc_utf16 -> - failwith - "Netconversion: Cannot output text as `Enc_utf16, use `Enc_utf16_le or \ - `Enc_utf16_be" - | `Enc_utf16_le -> write_utf16_lebe 0 1 - | `Enc_utf16_be -> write_utf16_lebe 1 0 - | `Enc_utf32 -> - failwith - "Netconversion: Cannot output text as `Enc_utf32, use `Enc_utf32_le or \ - `Enc_utf32_be" - | `Enc_utf32_le -> write_utf32_lebe true - | `Enc_utf32_be -> write_utf32_lebe false - | `Enc_eucjp -> write_eucjp () - | `Enc_euckr -> write_euckr () | `Enc_subset (e, def) -> let writer' = get_writer e in write_subset writer' def - | _ -> write_8bit enc let recode_poly ~in_ops ~in_enc ~in_buf ~in_pos ~in_len ~out_enc ~out_buf ~out_pos ~out_len ~max_chars ~subst = @@ -1883,32 +562,10 @@ let rec ustring_of_uchar enc = Bytes.sub_string s 0 n_act in match enc with - | `Enc_iso88591 -> - fun p -> - if p > 255 then raise (Cannot_represent p); - String.make 1 (Char.chr p) - | `Enc_usascii -> - fun p -> - if p > 127 then raise (Cannot_represent p); - String.make 1 (Char.chr p) - | `Enc_utf8 | `Enc_utf8_opt_bom -> multi_byte (write_utf8 false) 4 - | `Enc_java -> multi_byte (write_utf8 true) 4 - | `Enc_utf16_le -> multi_byte (write_utf16_lebe 0 1) 4 - | `Enc_utf16_be -> multi_byte (write_utf16_lebe 1 0) 4 - | `Enc_utf16 -> - invalid_arg "Netconversion.ustring_of_uchar: UTF-16 not possible" - | `Enc_utf32_le -> multi_byte (write_utf32_lebe true) 4 - | `Enc_utf32_be -> multi_byte (write_utf32_lebe false) 4 - | `Enc_utf32 -> - invalid_arg "Netconversion.ustring_of_uchar: UTF-32 not possible" - | `Enc_eucjp -> multi_byte (write_eucjp ()) 3 - | `Enc_euckr -> multi_byte (write_euckr ()) 2 + | `Enc_utf8 -> multi_byte (write_utf8 false) 4 | `Enc_subset (e, def) -> fun p -> if def p then ustring_of_uchar e p else raise (Cannot_represent p) - | _ -> - let writer = write_8bit enc in - multi_byte writer 1 let makechar enc = let us = ustring_of_uchar enc in diff --git a/ocamlnet_lite/netconversion.mli b/ocamlnet_lite/netconversion.mli index 3a9d921..615e4b7 100644 --- a/ocamlnet_lite/netconversion.mli +++ b/ocamlnet_lite/netconversion.mli @@ -273,119 +273,12 @@ exception Cannot_represent of int type encoding = [ `Enc_utf8 (* UTF-8 *) - | `Enc_utf8_opt_bom - | `Enc_java (* The variant of UTF-8 used by Java *) - | `Enc_utf16 (* UTF-16 with unspecified endianess (restricted usage) *) - | `Enc_utf16_le (* UTF-16 little endian *) - | `Enc_utf16_be (* UTF-16 big endian *) - | `Enc_utf32 (* UTF-32 with unspecified endianess (restricted usage) *) - | `Enc_utf32_le (* UTF-32 little endian *) - | `Enc_utf32_be (* UTF-32 big endian *) - | `Enc_usascii (* US-ASCII (only 7 bit) *) - | `Enc_iso88591 (* ISO-8859-1 *) - | `Enc_iso88592 (* ISO-8859-2 *) - | `Enc_iso88593 (* ISO-8859-3 *) - | `Enc_iso88594 (* ISO-8859-4 *) - | `Enc_iso88595 (* ISO-8859-5 *) - | `Enc_iso88596 (* ISO-8859-6 *) - | `Enc_iso88597 (* ISO-8859-7 *) - | `Enc_iso88598 (* ISO-8859-8 *) - | `Enc_iso88599 (* ISO-8859-9 *) - | `Enc_iso885910 (* ISO-8859-10 *) - | `Enc_iso885911 (* ISO-8859-11 *) - | `Enc_iso885913 (* ISO-8859-13 *) - | `Enc_iso885914 (* ISO-8859-14 *) - | `Enc_iso885915 (* ISO-8859-15 *) - | `Enc_iso885916 (* ISO-8859-16 *) - | `Enc_koi8r (* KOI8-R *) - | `Enc_jis0201 (* JIS-X-0201 (Roman in lower half; Katakana upper half *) - | `Enc_eucjp (* EUC-JP (includes US-ASCII, JIS-X-0201, -0208, -0212) *) - | (* Japanese, TODO: *) - (*| `Enc_iso2022jp of jis_state = [ `Enc_usascii | `Enc_jis0201 | - `Enc_jis0208_1978 | `Enc_jis0208_1893 ] - It is very likely that ISO-2022 will be handled in a different module. - This encoding is too weird. - | `Enc_sjis - *) - `Enc_euckr - (* EUC-KR (includes US-ASCII, KS-X-1001) *) - | (* Older standards: *) - `Enc_asn1_iso646 - (* only the language-neutral subset - "IA5String" *) - | `Enc_asn1_T61 (* ITU T.61 ("Teletex") *) - | `Enc_asn1_printable (* ASN.1 Printable *) - | (* Microsoft: *) - `Enc_windows1250 (* WINDOWS-1250 *) - | `Enc_windows1251 (* WINDOWS-1251 *) - | `Enc_windows1252 (* WINDOWS-1252 *) - | `Enc_windows1253 (* WINDOWS-1253 *) - | `Enc_windows1254 (* WINDOWS-1254 *) - | `Enc_windows1255 (* WINDOWS-1255 *) - | `Enc_windows1256 (* WINDOWS-1256 *) - | `Enc_windows1257 (* WINDOWS-1257 *) - | `Enc_windows1258 (* WINDOWS-1258 *) - | (* IBM, ASCII-based: *) - `Enc_cp437 - | `Enc_cp737 - | `Enc_cp775 - | `Enc_cp850 - | `Enc_cp852 - | `Enc_cp855 - | `Enc_cp856 - | `Enc_cp857 - | `Enc_cp860 - | `Enc_cp861 - | `Enc_cp862 - | `Enc_cp863 - | `Enc_cp864 - | `Enc_cp865 - | `Enc_cp866 - | `Enc_cp869 - | `Enc_cp874 - | `Enc_cp1006 - | (* IBM, EBCDIC-based: *) - `Enc_cp037 - | `Enc_cp424 - | `Enc_cp500 - | `Enc_cp875 - | `Enc_cp1026 - | `Enc_cp1047 - | (* Adobe: *) - `Enc_adobe_standard_encoding - | `Enc_adobe_symbol_encoding - | `Enc_adobe_zapf_dingbats_encoding - | (* Apple: *) - `Enc_macroman | (* Encoding subset: *) - `Enc_subset of encoding * (int -> bool) - | `Enc_empty (* does not encode any character *) ] + `Enc_subset of encoding * (int -> bool) ] (** The polymorphic variant enumerating the supported encodings. We have: * - [`Enc_utf8]: UTF-8 - * - [`Enc_utf8_opt_bom]: UTF-8 with an optional byte order mark at the - * beginning of the text - * - [`Enc_java]: The UTF-8 variant used by Java (the only difference is - * the representation of NUL) - * - [`Enc_utf16]: UTF-16 with unspecified endianess (restricted) - * - [`Enc_utf16_le]: UTF-16 little endian - * - [`Enc_utf16_be]: UTF-16 big endian - * - [`Enc_utf32]: UTF-32 with unspecified endianess (restricted) - * - [`Enc_utf32_le]: UTF-32 little endian - * - [`Enc_utf32_be]: UTF-32 big endian - * - [`Enc_usascii]: US-ASCII (7 bits) - * - [`Enc_iso8859]{i n}: ISO-8859-{i n} - * - [`Enc_koi8r]: KOI8-R - * - [`Enc_jis0201]: JIS-X-0201 (Roman and Katakana) - * - [`Enc_eucjp]: EUC-JP (code points from US-ASCII, JIS-X-0202, -0208, and - * -0212) - * - [`Enc_euckr]: EUC-KR (code points from US-ASCII, KS-X-1001) - * - [`Enc_windows]{i n}: WINDOWS-{i n} - * - [`Enc_cp]{i n}: IBM code page {i n}. Note that there are both ASCII- - * and EBCDIC-based code pages - * - [`Enc_adobe_*]: Adobe-specific encodings, e.g. used in Adobe fonts - * - [`Enc_mac*]: Macintosh-specific encodings * - [`Enc_subset(e,def)]: The subset of [e] by applying the definition * function [def] - * - [`Enc_empty]: The empty encoding (does not represent any character) *) (**********************************************************************) @@ -466,28 +359,6 @@ val convert : * of the input string minus [range_pos]) *) -val is_ascii_compatible : encoding -> bool -(** "ASCII compatible" means: The bytes 1 to 127 represent the ASCII - * codes 1 to 127, and no other representation of a character contains - * the bytes 1 to 127. - * - * For example, ISO-8859-1 is ASCII-compatible because the byte 1 to - * 127 mean the same as in ASCII, and all other characters use bytes - * greater than 127. UTF-8 is ASCII-compatible for the same reasons, - * it does not matter that there are multi-byte characters. - * EBCDIC is not ASCII-compatible because the bytes 1 to 127 do not mean - * the same as in ASCII. UTF-16 is not ASCII-compatible because the bytes - * 1 to 127 can occur in multi-byte representations of non-ASCII - * characters. - * - * The byte 0 has been excluded from this definition because the C - * language uses it with a special meaning that has nothing to do with - * characters, so it is questionable to interpret the byte 0 anyway. - *) - -val is_single_byte : encoding -> bool -(** Returns whether the encoding is a single-byte encoding *) - val makechar : encoding -> int -> string (** [makechar enc i:] * Creates the string representing the Unicode code point [i] in encoding diff --git a/ocamlnet_lite/netencoding.ml b/ocamlnet_lite/netencoding.ml index 4094163..b238bbd 100644 --- a/ocamlnet_lite/netencoding.ml +++ b/ocamlnet_lite/netencoding.ml @@ -399,132 +399,38 @@ module Html = struct let unsafe_chars_html4 = "<>\"&\000\001\002\003\004\005\006\007\008\011\012\014\015\016\017\018\019\020\021\022\023\024\025\026\027\028\029\030\031\127" - let regexp_ht = Hashtbl.create 7 - - let regexp_set s = - try Hashtbl.find regexp_ht s - with Not_found -> - let re = Netstring_str.regexp (Netstring_str.quote_set s) in - if Hashtbl.length regexp_ht < 100 then - (* avoid leak *) - Hashtbl.replace regexp_ht s re; - re - - (* The functions [encode_quickly] and [encode_ascii] are special cases of - * [encode] that can be implemented by regular expressions. - *) - - let encode_quickly ~prefer_name ~unsafe_chars () = - (* Preconditions: in_enc = out_enc, and the encoding must be a single-byte, - * ASCII-compatible encoding. - *) - if unsafe_chars = "" then fun s -> s - else - let unsafe_re = regexp_set unsafe_chars in - Netstring_str.global_substitute unsafe_re (fun r s -> - let t = Netstring_str.matched_string r s in - let p = Char.code t.[0] in - (* p is an ASCII code point *) - let name = rev_etable.(p) in - if prefer_name && name <> "" then name - else "&#" ^ string_of_int p ^ ";") - - let encode_quickly_poly ~prefer_name ~unsafe_chars ~ops ~out_kind () = - Netstring_tstring.polymorph_string_transformation - (encode_quickly ~prefer_name ~unsafe_chars ()) - ops out_kind - - let msb_set = - let s = Bytes.create 128 in - for k = 0 to 127 do - Bytes.set s k (Char.chr (128 + k)) - done; - Bytes.unsafe_to_string s - - let encode_ascii ~in_enc ~prefer_name ~unsafe_chars () = - (* Preconditions: out_enc = `Enc_usascii, and in_enc must be a single-byte, - * ASCII-compatible encoding. - *) - let unsafe_chars1 = unsafe_chars ^ msb_set in - let unsafe_re = regexp_set unsafe_chars1 in - (* unicode_of.[q] = p: the code point q+128 of in_enc is the same as the - * Unicode code point p - *) - let unicode_of = Array.make 128 (-1) in - for i = 0 to 127 do - try - let s = String.make 1 (Char.chr (i + 128)) in - let u = Netconversion.uarray_of_ustring in_enc s in - match u with [| u0 |] -> unicode_of.(i) <- u0 | _ -> assert false - with Netconversion.Malformed_code -> unicode_of.(i) <- -1 - done; - Netstring_str.global_substitute unsafe_re (fun r s -> - let t = Netstring_str.matched_string r s in - (* p is the code point in the encoding ~in_enc; p' is the Unicode - * code point: - *) - let p = Char.code t.[0] in - let p' = if p < 128 then p else unicode_of.(p - 128) in - if p' < 0 then raise Netconversion.Malformed_code; - let name = - if prefer_name then - if p' <= 255 then rev_etable.(p') - else try Hashtbl.find rev_etable_rest p' with Not_found -> "" - else "" - in - if name = "" then "&#" ^ string_of_int p' ^ ";" else name) - - let encode_ascii_poly ~in_enc ~prefer_name ~unsafe_chars ~ops ~out_kind () = - Netstring_tstring.polymorph_string_transformation - (encode_ascii ~in_enc ~prefer_name ~unsafe_chars ()) - ops out_kind - - let encode_poly ~in_enc ~in_ops ~out_kind ?(out_enc = `Enc_usascii) + let encode_poly ~in_enc ~in_ops ~out_kind ?(out_enc = `Enc_utf8) ?(prefer_name = true) ?(unsafe_chars = unsafe_chars_html4) () = (* This function implements the general case *) - (* Check arguments: *) - if not (Netconversion.is_ascii_compatible out_enc) then - invalid_arg "Netencoding.Html.encode: out_enc not ASCII-compatible"; for i = 0 to String.length unsafe_chars - 1 do if Char.code unsafe_chars.[i] >= 128 then invalid_arg "Netencoding.Html.encode: non-ASCII character in unsafe_chars" done; (* Are there better implementations than the general one? *) - let in_single = Netconversion.is_single_byte in_enc in - let in_subset = - match in_enc with `Enc_subset (_, _) -> true | _ -> false - in - if (not in_subset) && in_enc = out_enc && in_single then - encode_quickly_poly ~prefer_name ~unsafe_chars ~ops:in_ops ~out_kind () - else if (not in_subset) && out_enc = `Enc_usascii && in_single then - encode_ascii_poly ~in_enc ~prefer_name ~unsafe_chars ~ops:in_ops ~out_kind - () - else - (* ... only the general implementation is applicable. *) - (* Create the domain function: *) - let dom_array = Array.make 128 true in - let dom p = p >= 128 || dom_array.(p) in - (* Set dom_array from unsafe_chars: *) - for i = 0 to String.length unsafe_chars - 1 do - let c = Char.code unsafe_chars.[i] in - dom_array.(c) <- false - done; - (* Create the substitution function: *) - let subst p = - let name = - if prefer_name then - if p <= 255 then rev_etable.(p) - else try Hashtbl.find rev_etable_rest p with Not_found -> "" - else "" - in - if name = "" then "&#" ^ string_of_int p ^ ";" else name + (* Create the domain function: *) + let dom_array = Array.make 128 true in + let dom p = p >= 128 || dom_array.(p) in + (* Set dom_array from unsafe_chars: *) + for i = 0 to String.length unsafe_chars - 1 do + let c = Char.code unsafe_chars.[i] in + dom_array.(c) <- false + done; + (* Create the substitution function: *) + let subst p = + let name = + if prefer_name then + if p <= 255 then rev_etable.(p) + else try Hashtbl.find rev_etable_rest p with Not_found -> "" + else "" in - (* Recode: *) - fun s -> - Netconversion.convert_poly ~in_ops ~out_kind ~subst ~in_enc - ~out_enc:(`Enc_subset (out_enc, dom)) - s + if name = "" then "&#" ^ string_of_int p ^ ";" else name + in + (* Recode: *) + fun s -> + Netconversion.convert_poly ~in_ops ~out_kind ~subst ~in_enc + ~out_enc:(`Enc_subset (out_enc, dom)) + s let encode ~in_enc ?out_enc ?prefer_name ?unsafe_chars () = let in_ops = Netstring_tstring.string_ops in @@ -542,10 +448,6 @@ module Html = struct * encoding must be "byte-total" *) function - | `Enc_iso88591 | `Enc_iso88592 | `Enc_iso88593 | `Enc_iso88594 - | `Enc_iso88595 | `Enc_iso88599 | `Enc_iso885910 | `Enc_iso885913 - | `Enc_iso885914 | `Enc_iso885915 | `Enc_iso885916 -> - true | _ -> false let hex_digit_of_char c = @@ -582,9 +484,6 @@ module Html = struct failwith ("Netencoding.Html.decode: Character cannot be represented: " ^ string_of_int p)) ?(entity_base = (`Html : entity_set)) () = - (* Argument checks: *) - if not (Netconversion.is_ascii_compatible in_enc) then - invalid_arg "Netencoding.Html.decode: in_enc not ASCII-compatible"; (* makechar: *) let raw_makechar = Netconversion.makechar out_enc in let makechar p = try raw_makechar p with Not_found -> subst p in @@ -657,5 +556,4 @@ module Html = struct let decode ~in_enc ~out_enc ?lookup ?subst ?entity_base () = let out_kind = Netstring_tstring.String_kind in decode_half_poly ~in_enc ~out_kind ~out_enc ?lookup ?subst ?entity_base () - end From e223ac1a1339278930270b4143e337a5133e66e3 Mon Sep 17 00:00:00 2001 From: Javier Chavarri Date: Tue, 12 Dec 2023 14:20:43 +0000 Subject: [PATCH 7/8] simplify moar --- ocamlnet_lite/netaux.ml | 19 -------------- ocamlnet_lite/netaux.mli | 18 ------------- ocamlnet_lite/netconversion.mli | 8 ------ ocamlnet_lite/netdb.ml | 34 ------------------------ ocamlnet_lite/netdb.mli | 38 --------------------------- ocamlnet_lite/netmappings.ml | 39 ---------------------------- ocamlnet_lite/netmappings.mli | 46 --------------------------------- 7 files changed, 202 deletions(-) delete mode 100644 ocamlnet_lite/netdb.ml delete mode 100644 ocamlnet_lite/netdb.mli delete mode 100644 ocamlnet_lite/netmappings.ml delete mode 100644 ocamlnet_lite/netmappings.mli diff --git a/ocamlnet_lite/netaux.ml b/ocamlnet_lite/netaux.ml index a7a307f..3af5e6e 100644 --- a/ocamlnet_lite/netaux.ml +++ b/ocamlnet_lite/netaux.ml @@ -31,23 +31,4 @@ module ArrayAux = struct let int_blit src srcpos dest destpos len = !int_blit_ref src srcpos dest destpos len - let int_series_ref = - ref - (fun src srcpos dst dstpos len n -> - if (len < 0 || srcpos < 0 || dstpos < 0 || - srcpos+len > Array.length src || - dstpos+len > Array.length dst) - then - invalid_arg "Netaux.ArrayAux.int_series"; - - let s = ref n in - for i = 0 to len-1 do - Array.unsafe_set dst (dstpos+i) !s; - s := !s + Array.unsafe_get src (srcpos+i) - done - ) - - let int_series src srcpos dst dstpos len n = - !int_series_ref src srcpos dst dstpos len n - end diff --git a/ocamlnet_lite/netaux.mli b/ocamlnet_lite/netaux.mli index 69934c8..e04dac5 100644 --- a/ocamlnet_lite/netaux.mli +++ b/ocamlnet_lite/netaux.mli @@ -10,22 +10,4 @@ module ArrayAux : sig (** A specialisation of [Array.blit] for int arrays. * (Performance reasons.) *) - - val int_series : int array -> int -> int array -> int -> int -> int -> unit - (** [int_series src srcpos dst dstpos len n]: - * Computes for every [i], [0 <= i < len]: - * [dst.(dstpos+i) = n + SUM(j=0..(i-1): src.(srcpos+j)) ] - * - * It is expected that [src == dst] implies [srcpos >= dstpos]. - *) - - (**/**) - - val int_blit_ref : - (int array -> int -> int array -> int -> int -> unit) ref - (* Used by [Netaccel] to override the built-in implementation *) - - val int_series_ref : - (int array -> int -> int array -> int -> int -> int -> unit) ref - (* Used by [Netaccel] to override the built-in implementation *) end diff --git a/ocamlnet_lite/netconversion.mli b/ocamlnet_lite/netconversion.mli index 615e4b7..96f6227 100644 --- a/ocamlnet_lite/netconversion.mli +++ b/ocamlnet_lite/netconversion.mli @@ -263,14 +263,6 @@ * - A [uarray] is an [array of int] representing a string *) -exception Malformed_code -(** Raised when an illegal byte sequence is found *) - -exception Cannot_represent of int -(** Raised when a certain Unicode code point cannot be represented in - * the selected output encoding - *) - type encoding = [ `Enc_utf8 (* UTF-8 *) | (* Encoding subset: *) diff --git a/ocamlnet_lite/netdb.ml b/ocamlnet_lite/netdb.ml deleted file mode 100644 index 29bed6b..0000000 --- a/ocamlnet_lite/netdb.ml +++ /dev/null @@ -1,34 +0,0 @@ -let values = Hashtbl.create 13 -let loaders = Hashtbl.create 13 -let cksums = Hashtbl.create 13 -let enabled = ref true - -let read_db name = - let v = - try Hashtbl.find values name - with Not_found -> - if not !enabled then - failwith - ("Ocamlnet: The lookup table `" ^ name - ^ "' is not compiled into the program, and access to " - ^ "the external file database is disabled"); - let loader = - try Hashtbl.find loaders name - with Not_found -> failwith ("Ocamlnet: No such lookup table: " ^ name) - in - loader name - in - try - let cksum = Hashtbl.find cksums name in - if Digest.string v <> cksum then - failwith ("Netdb: checksum error for table: " ^ name); - v - with Not_found -> v - -let exists_db name = - Hashtbl.mem values name || (!enabled && Hashtbl.mem loaders name) - -let set_db name value = Hashtbl.replace values name value -let set_db_checksum name cksum = Hashtbl.replace cksums name cksum -let set_db_loader name loader = Hashtbl.replace loaders name loader -let enable_db_loaders b = enabled := b diff --git a/ocamlnet_lite/netdb.mli b/ocamlnet_lite/netdb.mli deleted file mode 100644 index 4559185..0000000 --- a/ocamlnet_lite/netdb.mli +++ /dev/null @@ -1,38 +0,0 @@ -(* This is an internal interface of ocamlnet! Do not use outside! *) - -(* This module manages persistent values (often lookup tables). These - * values can be stored in external files, or they can be initialized - * from string values. - *) - -val read_db : string -> string - (* Reads the value with the given name, and returns it. - * - * First it is checked whether there was a set_db call, and if so, - * this value is unmarshalled and returned. Otherwise, it is checked - * whether there is a loader, and if so, it is called. - * - * In both cases the checksum is checked. - *) - -val exists_db : string -> bool - (* Checks whether the named value is available, i.e. read_db would - * be able to find it - *) - -val set_db_checksum : string -> string -> unit - (* [set_db_checksum key cksum]: sets the MD5 digest of this key *) - -val set_db : string -> string -> unit - (* Sets the persistent value with the given name (1st arg) to the - * passed value (2nd arg). The value must be marshalled as string. - *) - -val set_db_loader : string -> (string -> string) -> unit - (* [set_db_loader key loader]: sets a loader for this key, which is called - when set_db has not been set for this key. The arg of the loader is the - key. - *) - -val enable_db_loaders : bool -> unit - (* Whether dynamic loading is enabled *) diff --git a/ocamlnet_lite/netmappings.ml b/ocamlnet_lite/netmappings.ml deleted file mode 100644 index e0f7995..0000000 --- a/ocamlnet_lite/netmappings.ml +++ /dev/null @@ -1,39 +0,0 @@ -(* $Id$ - * ---------------------------------------------------------------------- - * - *) - -type from_uni_list = - | U_nil - | U_single of (int * int) - | U_double of (int * int * int * int) - | U_array of int array - -let to_unicode = Hashtbl.create 50 -let from_unicode = Hashtbl.create 50 - -let get_to_unicode enc_name : int array = - try - let table = - try Hashtbl.find to_unicode enc_name - with Not_found -> - let t_str = Netdb.read_db ("cmapf." ^ enc_name) in - let t = Marshal.from_string t_str 0 in - Hashtbl.add to_unicode enc_name t; - t - in - table - with error -> raise error - -let get_from_unicode enc_name : from_uni_list array = - try - let table = - try Hashtbl.find from_unicode enc_name - with Not_found -> - let t_str = Netdb.read_db ("cmapr." ^ enc_name) in - let t = Marshal.from_string t_str 0 in - Hashtbl.add from_unicode enc_name t; - t - in - table - with error -> raise error diff --git a/ocamlnet_lite/netmappings.mli b/ocamlnet_lite/netmappings.mli deleted file mode 100644 index d89c3d1..0000000 --- a/ocamlnet_lite/netmappings.mli +++ /dev/null @@ -1,46 +0,0 @@ -(* $Id$ - * ---------------------------------------------------------------------- - *) - -(** Internal access to the character conversion database - * - * This is an internal module. - *) - -type from_uni_list = - U_nil - | U_single of (int*int) - | U_double of (int*int * int*int) - | U_array of int array -;; - (* A representation of (int*int) list that is optimized for the case that - * lists with 0 and 1 and 2 elements are the most frequent cases. - *) - - -val get_to_unicode : string -> int array - -val get_from_unicode : string -> from_uni_list array - (* These functions get the conversion tables from local encodings to - * Unicode and vice versa. - * It is normally not necessary to access these tables; the - * Netconversion module does it already for you. - * - * The argument is the internal name of the encoding. (E.g. if - * encoding = `Enc_iso88591, the internal name is "iso88591", i.e. - * the "`Enc_" prefix is removed. However, for "composite encodings" - * like `Enc_eucjp things are more complicated.) - * - * Specification of the conversion tables: - * - * to_unicode: maps a local code to Unicode, i.e. - * let m = Hashtbl.find `Enc_isoXXX to_unicode in - * let unicode = m.(isocode) - * - This may be (-1) to indicate that the code point is not defined. - * - * from_unicode: maps Unicode to a local code, i.e. - * let m = Hashtbl.find `Enc_isoXXX from_unicode in - * let l = m.(unicode land mask) - * Now search in l the pair (unicode, isocode), and return isocode. - * Where mask = Array.length from_unicode - 1 - *) From 35961b1b914734f56d69a2f0e85a876621104f1f Mon Sep 17 00:00:00 2001 From: Javier Chavarri Date: Tue, 12 Dec 2023 14:45:29 +0000 Subject: [PATCH 8/8] add back malformed_code exn --- ocamlnet_lite/netconversion.mli | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ocamlnet_lite/netconversion.mli b/ocamlnet_lite/netconversion.mli index 96f6227..615e4b7 100644 --- a/ocamlnet_lite/netconversion.mli +++ b/ocamlnet_lite/netconversion.mli @@ -263,6 +263,14 @@ * - A [uarray] is an [array of int] representing a string *) +exception Malformed_code +(** Raised when an illegal byte sequence is found *) + +exception Cannot_represent of int +(** Raised when a certain Unicode code point cannot be represented in + * the selected output encoding + *) + type encoding = [ `Enc_utf8 (* UTF-8 *) | (* Encoding subset: *)