From a7a9cd6adb6b121369fdb32b425a2c2f46194791 Mon Sep 17 00:00:00 2001 From: Anton Lavrik Date: Sun, 18 Aug 2019 21:17:50 +0100 Subject: [PATCH] Use sedlex instead ulex Sedlex is a successor of ulex based ppx. A side effect of this change is that now we use opam for installing third-party dependencies. --- .travis-ci.sh | 22 +--- INSTALL | 51 ++------- Makefile | 7 +- configure | 27 ++--- deps/Makefile | 4 - deps/ulex-camlp5/META | 6 -- deps/ulex-camlp5/Makefile | 40 ------- deps/ulex-camlp5/Makefile.camlp5 | 16 --- deps/ulex-camlp5/Makefile.download | 38 ------- opam | 2 +- piqilib/META.in | 2 +- piqilib/Makefile | 4 +- piqilib/piq_lexer.ml | 113 ++++++++++--------- piqilib/piqi_json_parser.mll | 10 +- piqilib/piqi_utf8.ml | 168 +++++++++++++++++++++++++++++ src/Makefile | 2 +- src/of_proto.ml | 30 +++--- src/piqi_http.ml | 79 +++++++------- 18 files changed, 327 insertions(+), 294 deletions(-) delete mode 100644 deps/ulex-camlp5/META delete mode 100644 deps/ulex-camlp5/Makefile delete mode 100644 deps/ulex-camlp5/Makefile.camlp5 delete mode 100644 deps/ulex-camlp5/Makefile.download create mode 100644 piqilib/piqi_utf8.ml diff --git a/.travis-ci.sh b/.travis-ci.sh index 1bffe018..bbcd60e3 100755 --- a/.travis-ci.sh +++ b/.travis-ci.sh @@ -22,7 +22,7 @@ else brew upgrade python # build dependencies (Mac OS X) - brew install ocaml opam + brew install opam # optional dependencies for running tests brew install protobuf @@ -39,36 +39,16 @@ else opam init eval `opam config env` - # install basic build dependencies using opam - opam install ocamlfind camlp5 - # so that tests pass with the latest protobuf version export CXXFLAGS='-std=c++14' - elif [ "$OCAML_VERSION" = "system" ] - then - # build dependencies (Ubuntu) - sudo apt-get install ocaml-nox camlp5 ocaml-findlib - - # optional dependencies for running tests and building docs - # - # NOTE: these need to be tested only once, no need to re-run these - # tests for all OCaml versions - sudo apt-get install libprotoc-dev protobuf-compiler pandoc - - echo "system OCaml version:" - ocaml -version else # install specific ocaml version wget https://raw.githubusercontent.com/ocaml/ocaml-ci-scripts/master/.travis-ocaml.sh . .travis-ocaml.sh - - # install basic build dependencies using opam - opam install ocamlfind camlp5 fi - ./configure make deps make diff --git a/INSTALL b/INSTALL index 4f75c7bb..6f0cf5c8 100644 --- a/INSTALL +++ b/INSTALL @@ -19,59 +19,30 @@ architectures supported by OCaml. See the list of supported platforms at PREREQUISITES ============= -On Debian and Ubuntu: +1. Install OCaml and opam (package manager for OCaml) - apt-get install ocaml camlp5 ocaml-findlib gcc make sed +On Debian and Ubuntu: + apt-get install opam On Mac OS X (using Homebrew): brew install ocaml opam - opam init - eval `opam config env` - opam install ocamlfind camlp5 - - (Here, we used [OPAM](http://opam.ocaml.org/) to build and install - ocamlfind). - - -On Windows: - - See the "CROSS-COMPILATION" section below for instructions on building a - Windows executable on Linux and Mac OS X. - For building native Windows binaries, download and install OCaml for - Windows from here: +On other systems, see [OPAM](http://opam.ocaml.org/) - http://protz.github.io/ocaml-installer/ +To cross-compile for Windows, see the "CROSS-COMPILATION" section below - For building under Cygwin, follow the steps from the next section. +2. Configure opam -On other Unix systems: - - * OCaml (>= 4.03.0) - - OCaml is available in most Linux distributions. It can also be - downloaded from here: - - http://caml.inria.fr/download.en.html - - * Camlp5 -- Caml Preprocessor and Pretty Printer - - * Ocamlfind/Findlib -- library manager for OCaml - - If installed separately form prepackaged OCaml distribution, it - is recommended to build it from sources after you build/install - OCaml. It can be downloaded from here: + opam init + eval `opam config env` - http://projects.camlcity.org/projects/findlib.html - * C compiler - GCC works. Other compilers should work if they are able to build - OCaml. +3. Install dependencies - * GNU make, sed and probably some other tools. + opam install --deps-only ./opam INSTALLATION INSTRUCTIONS @@ -81,7 +52,7 @@ INSTALLATION INSTRUCTIONS Run "./configure --help" for the list of available options. -2. Build third-party dependencies +2. Install third-party dependencies make deps diff --git a/Makefile b/Makefile index 33080b7d..777ba78b 100644 --- a/Makefile +++ b/Makefile @@ -24,9 +24,10 @@ endif deps: build-dir - $(MAKE) -C deps download - $(MAKE) -C deps - $(MAKE) -C deps uninstall install + @#$(MAKE) -C deps download + @#$(MAKE) -C deps + @#$(MAKE) -C deps uninstall install + opam install -y --deps-only ./opam build-dir: diff --git a/configure b/configure index 4ad06d82..c6d470d7 100755 --- a/configure +++ b/configure @@ -101,6 +101,20 @@ then # skipping everything else in cross-compilation mode; namely, we have to # cross-compile deps that come with the package echo "export SYSTEM := unix" >> $M + + # figure out which dependencies we need to build + echo "checking whether necessary dependencies are already installed..." + for i in xmlm easy-format base64 + do + dir="`$OCAMLFIND query $i 2>/dev/null`" + if [ $? -eq 0 ] + then + echo "$i is installed in $dir" + echo "SKIP-$i = 1" >> $M + else + echo "$i is not installed; it will be built during \"make deps\"" + fi + done else # detecting the type of OCaml toolchain system="`ocamlc -config 2>/dev/null | grep system | sed 's/system: //'`" @@ -109,19 +123,6 @@ else echo "detected $system OCaml toolchain" fi -# figure out which dependencies we need to build -echo "checking whether necessary dependencies are already installed..." -for i in xmlm ulex-camlp5 easy-format base64 -do - dir="`$OCAMLFIND query $i 2>/dev/null`" - if [ $? -eq 0 ] - then - echo "$i is installed in $dir" - echo "SKIP-$i = 1" >> $M - else - echo "$i is not installed; it will be built during \"make deps\"" - fi -done OCAML_VERSION=`$OCAMLFIND ocamlc -version` if [ $? -ne 0 ] diff --git a/deps/Makefile b/deps/Makefile index ce40a70c..ea39457c 100644 --- a/deps/Makefile +++ b/deps/Makefile @@ -16,10 +16,6 @@ endif export OCAMLPATH -ifndef SKIP-ulex -DIRS += ulex-camlp5 -endif - ifndef SKIP-xmlm DIRS += xmlm endif diff --git a/deps/ulex-camlp5/META b/deps/ulex-camlp5/META deleted file mode 100644 index 2fb35716..00000000 --- a/deps/ulex-camlp5/META +++ /dev/null @@ -1,6 +0,0 @@ -version = "1.1" -description = "Runtime support for ulex" -archive(byte) = "ulexing.cma" -archive(native) = "ulexing.cmxa" -archive(syntax,toploop) = "pa_ulex.cma ulexing.cma" -archive(syntax,preprocessor) = "pa_ulex.cma" diff --git a/deps/ulex-camlp5/Makefile b/deps/ulex-camlp5/Makefile deleted file mode 100644 index 9e9a1f91..00000000 --- a/deps/ulex-camlp5/Makefile +++ /dev/null @@ -1,40 +0,0 @@ -OCAMLMAKEFILE := ../../make/OCamlMakefile - - -RESULT = ulexing - -SOURCES = cset.ml utf8.ml ulexing.ml utf16.ml ulex.ml - - -OCAMLFIND_PACKAGE = ulex-camlp5 - -MODS = ulexing utf16 utf8 -LIBINSTALL_FILES = \ - $(wildcard $(MODS:=.mli) $(MODS:=.cmi) $(MODS:=.cmx) pa_ulex.cma ulexing.a ulexing.cma ulexing.cmxa) - -PACKS = camlp5 - - -all: bcl ncl pa_ulex.cma - - -install: libinstall - - -uninstall: libuninstall - - -pa_ulex.cma: - $(MAKE) -f Makefile.camlp5 - - -download: - $(MAKE) -f Makefile.download - - -clean:: - $(MAKE) -f Makefile.camlp5 clean - $(MAKE) -f Makefile.download clean - - -include $(OCAMLMAKEFILE) diff --git a/deps/ulex-camlp5/Makefile.camlp5 b/deps/ulex-camlp5/Makefile.camlp5 deleted file mode 100644 index 317b81c1..00000000 --- a/deps/ulex-camlp5/Makefile.camlp5 +++ /dev/null @@ -1,16 +0,0 @@ -OCAMLMAKEFILE := ../../make/OCamlMakefile -unexport OCAMLFIND_TOOLCHAIN - - -RESULT = pa_ulex - -SOURCES = cset.ml utf8.ml ulexing.ml utf16.ml ulex.ml pa_ulex.ml - - -INCDIRS = +camlp5 - - -all: bcl - - -include $(OCAMLMAKEFILE) diff --git a/deps/ulex-camlp5/Makefile.download b/deps/ulex-camlp5/Makefile.download deleted file mode 100644 index ab55db3b..00000000 --- a/deps/ulex-camlp5/Makefile.download +++ /dev/null @@ -1,38 +0,0 @@ -TARBALL := v1.2-camlp5.tar.gz -URL := https://github.com/sacerdot/ulex/archive/$(TARBALL) -MD5 := 60aeff1073fe9bbb1be46b2da3a7618a - - -UPSTREAM_FILES := \ - cset.ml ulexing.ml ulexing.mli ulex.ml ulex.mli utf8.ml utf8.mli utf16.ml utf16.mli \ - pa_ulex.ml - - -UPSTREAM_DIR := upstream - - -all: $(TARBALL) $(UPSTREAM_DIR) $(UPSTREAM_FILES) - - -$(UPSTREAM_FILES): %: $(UPSTREAM_DIR)/% - cp $< $@ - - -pa_ulex.ml: %: $(UPSTREAM_DIR)/% - echo "(*pp camlp5o pa_macro.cmo pa_extend.cmo q_MLast.cmo *)" > $@ - cat $< >> $@ - - -$(UPSTREAM_DIR): $(TARBALL) - mkdir -p $(UPSTREAM_DIR) - tar -xzf $< -C $(UPSTREAM_DIR) --strip-components=1 - - -$(TARBALL): - ../../make/http-download $(URL) - test $(MD5) = `../../make/md5sum $(TARBALL)` - - -clean: - rm -rf $(TARBALL) $(UPSTREAM_DIR) $(UPSTREAM_FILES) - diff --git a/opam b/opam index 17838473..afc107e1 100644 --- a/opam +++ b/opam @@ -24,7 +24,7 @@ depends: [ "ocaml" {>= "4.03"} "ocamlfind" {build} "easy-format" - "ulex-camlp5" + "sedlex" "xmlm" "base64" {>="3.1.0"} ] diff --git a/piqilib/META.in b/piqilib/META.in index 30043603..8e2c0b16 100644 --- a/piqilib/META.in +++ b/piqilib/META.in @@ -1,4 +1,4 @@ description = "The Piqi library -- runtime support for multi-format Protobuf/JSON/XML/Piq data serialization and conversion" -requires = "ulex-camlp5 easy-format xmlm base64" +requires = "easy-format xmlm base64 sedlex" archive(byte) = "piqilib.cma" archive(native) = "piqilib.cmxa" diff --git a/piqilib/Makefile b/piqilib/Makefile index 4878ebd5..74066842 100644 --- a/piqilib/Makefile +++ b/piqilib/Makefile @@ -9,8 +9,7 @@ LIBINSTALL_FILES += \ $(wildcard *.cmi) \ -PACKS = ulex-camlp5 easy-format xmlm base64 - +PACKS = easy-format xmlm base64 sedlex sedlex.ppx SOURCES = \ piqi_version.ml \ @@ -39,6 +38,7 @@ SOURCES = \ piqi_protobuf.ml \ piqi_db.ml \ \ + piqi_utf8.ml \ piq_lexer.ml \ piq_parser.ml \ piq_gen.ml \ diff --git a/piqilib/piq_lexer.ml b/piqilib/piq_lexer.ml index c0f8156b..969f81d5 100644 --- a/piqilib/piq_lexer.ml +++ b/piqilib/piq_lexer.ml @@ -1,4 +1,3 @@ -(*pp camlp5o -I `ocamlfind query ulex-camlp5` pa_ulex.cma *) (* Copyright 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2017 Anton Lavrik @@ -19,6 +18,9 @@ module C = Piqi_common open C.Std +module Utf8 = Piqi_utf8 + + exception Error0 of string (* internally used error exception *) @@ -87,9 +89,9 @@ let type_of_char c = else String_b -let regexp digit = ['0'-'9'] -let regexp odigit = ['0'-'7'] -let regexp xdigit = ['0'-'9''a'-'f''A'-'F'] +let digit = [%sedlex.regexp? '0'..'9'] +let odigit = [%sedlex.regexp? '0'..'7'] +let xdigit = [%sedlex.regexp? '0'..'9' | 'a'..'f' | 'A'..'F'] let make_char c = @@ -98,13 +100,12 @@ let make_char c = let escaped_lexeme lexbuf = (* strip the first symbol *) - let start = Ulexing.get_start lexbuf in - let pos = Ulexing.get_pos lexbuf in - Ulexing.utf8_sub_lexeme lexbuf 1 (pos - start - 1) + let len = Sedlexing.lexeme_length lexbuf in + Sedlexing.Utf8.sub_lexeme lexbuf 1 (len - 1) (* XXX: add support for a b f v escapes? *) -let parse_string_escape = lexer +let parse_string_escape lexbuf = [%sedlex match lexbuf with | '\\' -> make_char '\\' | '"' -> make_char '"' | 't' -> make_char '\t' @@ -113,26 +114,29 @@ let parse_string_escape = lexer (* XXX: disable it for now, since specifying decimals this way may make more * sense: | odigit odigit odigit -> - let c = int_of_ostring (Ulexing.utf8_lexeme lexbuf) in + let c = int_of_ostring (Sedlexing.Utf8.lexeme lexbuf) in (type_of_char c),c *) - | "x" xdigit xdigit -> + | 'x', xdigit, xdigit -> let c = int_of_xstring (escaped_lexeme lexbuf) in (type_of_char c),c - | 'u' xdigit xdigit xdigit xdigit -> + | 'u', xdigit, xdigit, xdigit, xdigit -> let c = int_of_xstring (escaped_lexeme lexbuf) in String_u,c - | 'U' xdigit xdigit xdigit xdigit xdigit xdigit xdigit xdigit -> + | 'U', xdigit, xdigit, xdigit, xdigit, xdigit, xdigit, xdigit, xdigit -> (* XXX: check code validity so that it doesn't exeed allocated limit *) let c = int_of_xstring (escaped_lexeme lexbuf) in String_u,c - | _ -> error "invalid string escape literal" + | _ -> + let s = Sedlexing.Utf8.lexeme lexbuf in + error ("invalid string escape literal " ^ s) +] (* returns the list of integers representing codepoints *) (* XXX: allow only printable characters in strings? *) (* XXX: provide a method for wraping a string to several lines? *) -let rec parse_string_literal ltype l = lexer +let rec parse_string_literal ltype l lexbuf = [%sedlex match lexbuf with | '\\' -> let ctype, c = parse_string_escape lexbuf in let ltype = @@ -146,14 +150,15 @@ let rec parse_string_literal ltype l = lexer | _,_ -> ltype (* leave the previous type *) in parse_string_literal ltype (c::l) lexbuf - | [0-0x1F 127] -> (* XXX: what about unicode non-printable chars? *) + | (0 .. 0x1F) | 127 -> (* XXX: what about unicode non-printable chars? *) (* do not allow non-printables to appear in string literals -- one * should use correspondent escaped specifications instead *) error "invalid string literal" | eof -> ltype,(List.rev l) - | _ -> - let c = Ulexing.lexeme_char lexbuf 0 in + | any -> + let c = Sedlexing.lexeme_char lexbuf 0 in + let c = Uchar.to_int c in let ltype = match ltype with String_b when c > 127 -> error "invalid string literal" @@ -161,6 +166,11 @@ let rec parse_string_literal ltype l = lexer | _ -> ltype in parse_string_literal ltype (c::l) lexbuf + | _ -> + let c = Sedlexing.lexeme_char lexbuf 0 in + let c = Uchar.to_int c in + error ("invalid string codepoint " ^ string_of_int c) +] let utf8_of_list l = @@ -181,7 +191,7 @@ let string_of_list l = let parse_string_literal s = - let lexbuf = Ulexing.from_utf8_string s in + let lexbuf = Sedlexing.Utf8.from_string s in let str_type, l = parse_string_literal String_a [] lexbuf in let parsed_str = match str_type with @@ -260,29 +270,27 @@ type token = | Raw_string of string -let regexp newline = ('\n' | "\r\n") -let regexp ws = [' ' '\t']+ +let newline = [%sedlex.regexp? '\n' | "\r\n"] +let ws = [%sedlex.regexp? Plus (' ' | '\t')] +let name = [%sedlex.regexp? (':' | '.'), Plus ('a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '/' | '.' | ':')] -let regexp name = [':' '.'] ['a'-'z' 'A'-'Z' '0'-'9' '-' '_' '/' '.' ':']+ - +let string_literal = [%sedlex.regexp? '"', Star (Compl '"' | "\\\""), '"'] (* ASCII alphanumeric, '-', '_', '.', '/' for representing numbers and unquoted * strings (useful e.g. as DSL identifiers) * * XXX: include all alphanumeric Unicode? *) -let regexp first_word_char = ['a'-'z' 'A'-'Z' '0'-'9' '-' '_'] - -let regexp word_char = (first_word_char | '.' | '/') +let first_word_char = [%sedlex.regexp? ('a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_')] -let regexp word = first_word_char word_char * +let word_char = [%sedlex.regexp? (first_word_char | '.' | '/')] +let word = [%sedlex.regexp? first_word_char, Star (word_char)] -let regexp float_literal = - ['0'-'9'] ['0'-'9' '_']* - ('.' ['0'-'9' '_']* )? - (['e' 'E'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']* )? - +let float_literal = + [%sedlex.regexp? ('0'..'9'), Star ('0'..'9' | '_'), + Opt ('.', Star ('0'..'9' | '_')), + Opt (('e' | 'E'), Opt ('+' | '-'), '0'..'9', Star ('0'..'9' | '_'))] let is_valid_first_word_char = function | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' -> true @@ -292,8 +300,7 @@ let is_valid_word_char = function | '.' | '/' -> true | x -> is_valid_first_word_char x - -(* accepts the same language as the regexp above *) +(* accepts the same language as the word regexp above *) let is_valid_word s = let len = String.length s in (* NOTE: it works transparently on utf8 strings *) @@ -314,7 +321,7 @@ let is_valid_word s = type buf = { - lexbuf : Ulexing.lexbuf; + lexbuf : Sedlexing.lexbuf; mutable lcount : int; (* line counter *) mutable lstart : int; (* buffer position of the latest line *) @@ -336,12 +343,12 @@ let make_buf lexbuf = let update_line_counter buf = buf.lcount <- buf.lcount + 1; - buf.lstart <- Ulexing.lexeme_end buf.lexbuf + buf.lstart <- Sedlexing.lexeme_end buf.lexbuf let get_column buf = (* NOTE: ennumerating columns from 1 *) - (Ulexing.lexeme_start buf.lexbuf) - buf.lstart + 1 + (Sedlexing.lexeme_start buf.lexbuf) - buf.lstart + 1 let update_column buf = @@ -358,7 +365,7 @@ let location buf = buf.lcount, buf.col -let rec token0 buf = lexer +let rec token0 buf lexbuf = [%sedlex match lexbuf with | newline -> (* update line counter, drop column counter and move on *) update_line_counter buf; @@ -369,16 +376,16 @@ let rec token0 buf = lexer error "invalid character" | "%%" -> error "'%%' literal is reserved for future versions" - | '%' ( [^'%' '\n'] [^'\n']* )? newline? -> (* skip single line comment *) + | '%', Opt ( Compl ('%' | '\n'), Star (Compl '\n') ), Opt newline -> (* skip single line comment *) update_line_counter buf; token0 buf lexbuf - | '#' newline? -> (* verbatim empty text *) + | '#', Opt newline -> (* verbatim empty text *) Text "" - | '#' [^' '] -> + | '#', Compl ' ' -> error "space is expected after '#'" - | '#' [' '] [^'\n']* newline? -> (* verbatim text *) + | '#', ' ', Star (Compl '\n'), Opt newline -> (* verbatim text *) (* TODO: restrict string literal to contain only printable characters *) - let s = Ulexing.utf8_lexeme lexbuf in + let s = Sedlexing.Utf8.lexeme lexbuf in let len = String.length s in if len = 0 then Text "" @@ -399,8 +406,8 @@ let rec token0 buf = lexer | ']' -> Rbr | '*' -> Star | ',' -> Comma - | '"'([^'"']|"\\\"")*'"' -> (* string literal *) - let s = Ulexing.utf8_lexeme lexbuf in + | string_literal -> + let s = Sedlexing.Utf8.lexeme lexbuf in let s = String.sub s 1 (String.length s - 2) in (* cut double-quotes *) let (str_type, parsed_str) = parse_string_literal s in @@ -408,10 +415,10 @@ let rec token0 buf = lexer | '"' -> error "string literal overrun" | name -> - let s = Ulexing.utf8_lexeme lexbuf in + let s = Sedlexing.Utf8.lexeme lexbuf in Name s | word -> - let s = Ulexing.utf8_lexeme lexbuf in + let s = Sedlexing.Utf8.lexeme lexbuf in Word s (* TODO: this is inconsistent - here, integers qualify as words automatically, @@ -420,11 +427,12 @@ let rec token0 buf = lexer * lexing error *) | float_literal -> - let s = Ulexing.utf8_lexeme lexbuf in + let s = Sedlexing.Utf8.lexeme lexbuf in Word s | eof -> EOF | _ -> error "invalid character" +] (* error reporter *) @@ -444,10 +452,9 @@ let token1 buf = tok with | Error0 s -> error buf s - | Ulexing.Error -> error buf "lexing internal error" - | Ulexing.InvalidCodepoint i -> + | Sedlexing.InvalidCodepoint i -> error buf ("invalid unicode code point " ^ string_of_int i) - | Utf8.MalFormed -> + | Sedlexing.MalFormed -> error buf "malformed utf-8" @@ -465,17 +472,17 @@ let token buf = let init_from_string s = - let lexbuf = Ulexing.from_utf8_string s in + let lexbuf = Sedlexing.Utf8.from_string s in make_buf lexbuf let init_from_stream s = - let lexbuf = Ulexing.from_utf8_stream s in + let lexbuf = Sedlexing.Utf8.from_stream s in make_buf lexbuf let init_from_channel ch = - let lexbuf = Ulexing.from_utf8_channel ch in + let lexbuf = Sedlexing.Utf8.from_channel ch in make_buf lexbuf diff --git a/piqilib/piqi_json_parser.mll b/piqilib/piqi_json_parser.mll index b8afe935..255fcf37 100644 --- a/piqilib/piqi_json_parser.mll +++ b/piqilib/piqi_json_parser.mll @@ -164,13 +164,13 @@ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. if i = pos + bytes then n else begin - let w = Utf8.width.(Char.code s.[i]) in + let w = Piqi_utf8.width.(Char.code s.[i]) in if w > 0 && i + w <= pos + bytes then ( (* check if the next unicode char is correctly encoded in utf8 *) - ignore (Utf8.next s i); + ignore (Piqi_utf8.next s i); aux (succ n) (i + w) - ) else raise Utf8.MalFormed + ) else raise Piqi_utf8.MalFormed end in aux 0 pos @@ -179,7 +179,7 @@ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. let check_adjust_utf8 v lexbuf s start len = let utf8_len = try utf8_length s start len - with Utf8.MalFormed -> + with Piqi_utf8.MalFormed -> custom_error "Invalid utf-8 sequence" v lexbuf in v.utf8_delta <- v.utf8_delta + (len - utf8_len) @@ -345,7 +345,7 @@ and finish_escaped_char v = parse | 't' { Buffer.add_char v.buf '\t' } | 'u' (( hex hex hex hex ) as s) { let i = Piq_lexer.int_of_xstring s in - Utf8.store v.buf i } + Piqi_utf8.store v.buf i } | _ { lexer_error "Invalid escape sequence" v lexbuf } | eof { custom_error "Unexpected end of input" v lexbuf } diff --git a/piqilib/piqi_utf8.ml b/piqilib/piqi_utf8.ml new file mode 100644 index 00000000..c2aeee8a --- /dev/null +++ b/piqilib/piqi_utf8.ml @@ -0,0 +1,168 @@ +(* + +This is a copy of ulex/utf8.ml from the ulex package. + + +The package ulex is released under the terms of an MIT-like license. + +Copyright 2005 by Alain Frisch. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*) + +exception MalFormed + +(* cf http://www.faqs.org/rfcs/rfc3629.html *) + +let width = Array.make 256 (-1) +let () = + for i = 0 to 127 do width.(i) <- 1 done; + for i = 192 to 223 do width.(i) <- 2 done; + for i = 224 to 239 do width.(i) <- 3 done; + for i = 240 to 247 do width.(i) <- 4 done + +let next s i = + match s.[i] with + | '\000'..'\127' as c -> + Char.code c + | '\192'..'\223' as c -> + let n1 = Char.code c in + let n2 = Char.code s.[i+1] in + if (n2 lsr 6 != 0b10) then raise MalFormed; + ((n1 land 0x1f) lsl 6) lor (n2 land 0x3f) + | '\224'..'\239' as c -> + let n1 = Char.code c in + let n2 = Char.code s.[i+1] in + let n3 = Char.code s.[i+2] in + if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) then raise MalFormed; + let p = + ((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f) + in + if (p >= 0xd800) && (p <= 0xdf00) then raise MalFormed; + p + | '\240'..'\247' as c -> + let n1 = Char.code c in + let n2 = Char.code s.[i+1] in + let n3 = Char.code s.[i+2] in + let n4 = Char.code s.[i+3] in + if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) || (n4 lsr 6 != 0b10) + then raise MalFormed; + ((n1 land 0x07) lsl 18) lor ((n2 land 0x3f) lsl 12) lor + ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f) + | _ -> raise MalFormed + + +(* With this implementation, a truncated code point will result + in Stream.Failure, not in MalFormed. *) + +let from_stream s = + match Stream.next s with + | '\000'..'\127' as c -> + Char.code c + | '\192'..'\223' as c -> + let n1 = Char.code c in + let n2 = Char.code (Stream.next s) in + if (n2 lsr 6 != 0b10) then raise MalFormed; + ((n1 land 0x1f) lsl 6) lor (n2 land 0x3f) + | '\224'..'\239' as c -> + let n1 = Char.code c in + let n2 = Char.code (Stream.next s) in + let n3 = Char.code (Stream.next s) in + if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) then raise MalFormed; + ((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f) + | '\240'..'\247' as c -> + let n1 = Char.code c in + let n2 = Char.code (Stream.next s) in + let n3 = Char.code (Stream.next s) in + let n4 = Char.code (Stream.next s) in + if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) || (n4 lsr 6 != 0b10) + then raise MalFormed; + ((n1 land 0x07) lsl 18) lor ((n2 land 0x3f) lsl 12) lor + ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f) + | _ -> raise MalFormed + + + +let compute_len s pos bytes = + let rec aux n i = + if i >= pos + bytes then if i = pos + bytes then n else raise MalFormed + else + let w = width.(Char.code s.[i]) in + if w > 0 then aux (succ n) (i + w) + else raise MalFormed + in + aux 0 pos + +let rec blit_to_int s spos a apos n = + if n > 0 then begin + a.(apos) <- next s spos; + blit_to_int s (spos + width.(Char.code s.[spos])) a (succ apos) (pred n) + end + +let to_int_array s pos bytes = + let n = compute_len s pos bytes in + let a = Array.make n 0 in + blit_to_int s pos a 0 n; + a + +(**************************) + +let width_code_point p = + if p <= 0x7f then 1 + else if p <= 0x7ff then 2 + else if p <= 0xffff then 3 + else if p <= 0x10ffff then 4 + else raise MalFormed + +let store b p = + if p <= 0x7f then + Buffer.add_char b (Char.chr p) + else if p <= 0x7ff then ( + Buffer.add_char b (Char.chr (0xc0 lor (p lsr 6))); + Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f))) + ) + else if p <= 0xffff then ( + if (p >= 0xd800 && p < 0xe000) then raise MalFormed; + Buffer.add_char b (Char.chr (0xe0 lor (p lsr 12))); + Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f))); + Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f))) + ) + else if p <= 0x10ffff then ( + Buffer.add_char b (Char.chr (0xf0 lor (p lsr 18))); + Buffer.add_char b (Char.chr (0x80 lor ((p lsr 12) land 0x3f))); + Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f))); + Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f))) + ) + else raise MalFormed + + +let from_int_array a apos len = + let b = Buffer.create (len * 4) in + let rec aux apos len = + if len > 0 then (store b a.(apos); aux (succ apos) (pred len)) + else Buffer.contents b in + aux apos len + +let stream_from_char_stream s = + Stream.from + (fun _ -> + try Some (from_stream s) + with Stream.Failure -> None) + diff --git a/src/Makefile b/src/Makefile index d5f9b159..ba63399d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -12,7 +12,7 @@ PACKS = unix # OCAMLPATH to reduce complexity of the windows build (e.g. avoid converting : # path separators to ; along with reverting slashes); we can't simply symlink it # to the build dir either because symlinks don't work on windows -PACKS += ulex-camlp5 easy-format xmlm base64 +PACKS += easy-format xmlm base64 sedlex sedlex.ppx INCDIRS = ../piqilib LIBS = ../piqilib/piqilib diff --git a/src/of_proto.ml b/src/of_proto.ml index ea5f8e1e..d81c29cc 100644 --- a/src/of_proto.ml +++ b/src/of_proto.ml @@ -1,4 +1,3 @@ -(*pp camlp5o -I `ocamlfind query ulex-camlp5` pa_ulex.cma *) (* Copyright 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2017 Anton Lavrik @@ -244,12 +243,12 @@ Reference implementation from protobuf-2.3.0/src/google/protobuf/stubs/strutil.c *) -let regexp digit = ['0'-'9'] -let regexp odigit = ['0'-'7'] -let regexp xdigit = ['0'-'9''a'-'f''A'-'F'] +let digit = [%sedlex.regexp? '0'..'9'] +let odigit = [%sedlex.regexp? '0'..'7'] +let xdigit = [%sedlex.regexp? '0'..'9' | 'a'..'f' | 'A'..'F'] -let parse_string_escape = lexer +let parse_string_escape lexbuf = [%sedlex match lexbuf with | 'a' -> '\007' | 'b' -> '\008' | 'f' -> '\012' @@ -261,34 +260,37 @@ let parse_string_escape = lexer | '?' -> '?' | '\'' -> '\'' | '"' -> '"' - | odigit odigit odigit -> - let v = Ulexing.latin1_lexeme lexbuf in + | odigit, odigit, odigit -> + let v = Sedlexing.Latin1.lexeme lexbuf in Char.chr (Piq_lexer.int_of_ostring v) - | ("x" | "X") xdigit xdigit -> - let v = Ulexing.latin1_sub_lexeme lexbuf 1 2 - in + | ("x" | "X"), xdigit, xdigit -> + let v = Sedlexing.Latin1.sub_lexeme lexbuf 1 2 in Char.chr (Piq_lexer.int_of_xstring v) | _ -> piqi_error "error: invalid string escape literal in .proto" +] (* returns the list of integers representing codepoints *) -let rec parse_string_literal buf = lexer +let rec parse_string_literal buf lexbuf = [%sedlex match lexbuf with | '\\' -> let c = parse_string_escape lexbuf in Buffer.add_char buf c; parse_string_literal buf lexbuf | eof -> Buffer.contents buf - | _ -> - let c = Ulexing.latin1_lexeme_char lexbuf 0 in + | any -> + let c = Sedlexing.Latin1.lexeme_char lexbuf 0 in Buffer.add_char buf c; parse_string_literal buf lexbuf + | _ -> + piqi_error "invalid string character in .proto" +] let parse_string_literal s = let buf = Buffer.create (String.length s) in - let lexbuf = Ulexing.from_latin1_string s in + let lexbuf = Sedlexing.Latin1.from_string s in parse_string_literal buf lexbuf diff --git a/src/piqi_http.ml b/src/piqi_http.ml index a3fa9672..d83e04ad 100644 --- a/src/piqi_http.ml +++ b/src/piqi_http.ml @@ -1,4 +1,3 @@ -(*pp camlp5o -I `ocamlfind query ulex-camlp5` pa_ulex.cma *) (* Copyright 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2017, 2018 Anton Lavrik @@ -27,13 +26,13 @@ * definitions from HTTP/1.1 standard *) -let regexp cr = '\r' -let regexp lf = '\n' -let regexp sp = ' ' (* space *) -let regexp ht = '\t' (* horisontal tab *) -let regexp crlf = cr lf +let cr = [%sedlex.regexp? '\r'] +let lf = [%sedlex.regexp? '\n'] +let sp = [%sedlex.regexp? ' ' ] (* space *) +let ht = [%sedlex.regexp? '\t'] (* horisontal tab *) +let crlf = [%sedlex.regexp? cr, lf] -let regexp digit = ['0'-'9'] +let digit = [%sedlex.regexp? '0'..'9'] (* @@ -47,8 +46,8 @@ let regexp digit = ['0'-'9'] TEXT = *) -let regexp lws = crlf? (sp | ht)+ -let regexp text = [^ 0-31 127] | lws +let lws = [%sedlex.regexp? Opt crlf, Plus (sp | ht)] +let text = [%sedlex.regexp? Compl (0 .. 31 | 127) | lws] (* @@ -58,7 +57,7 @@ let regexp text = [^ 0-31 127] | lws | "/" | "[" | "]" | "?" | "=" | "{" | "}" | SP | HT *) -let regexp token = [^ "()<>@,;:\\\"/[]?={} \t" 0-31 127-255 ]+ +let token = [%sedlex.regexp? Plus (Compl ((Chars "()<>@,;:\\\"/[]?={} \t") | 0 .. 31 | 127 .. 255))] (* HTTP message: @@ -75,17 +74,17 @@ let regexp token = [^ "()<>@,;:\\\"/[]?={} \t" 0-31 127-255 ]+ Reason-Phrase = * *) -let regexp http_version = "HTTP" "/" digit+ "." digit+ +let http_version = [%sedlex.regexp? "HTTP", "/", Plus digit, ".", Plus digit] -let regexp status_code = digit digit digit +let status_code = [%sedlex.regexp? digit, digit, digit] (* XXX: what does excluding CR, LF mean? *) -let regexp reason_phrase = [^ 0-31 127 '\r' '\n']* +let reason_phrase = [%sedlex.regexp? Star (Compl (0 .. 31 | 127 | '\r' | '\n'))] -let regexp status_line = http_version sp status_code sp reason_phrase crlf +let status_line = [%sedlex.regexp? http_version, sp, status_code, sp, reason_phrase, crlf] -let regexp status_line_head = http_version sp -let regexp status_line_tail = sp reason_phrase crlf +let status_line_head = [%sedlex.regexp? http_version, sp] +let status_line_tail = [%sedlex.regexp? sp, reason_phrase, crlf] (* HTTP header: @@ -98,11 +97,11 @@ let regexp status_line_tail = sp reason_phrase crlf *) (* NOTE, XXX: this regexp doesn't include quoted-string case, only "*TEXT" *) -let regexp field_value = text* +let field_value = [%sedlex.regexp? Star text] -let regexp message_header = token ":" field_value? +let message_header = [%sedlex.regexp? token, ":", Opt field_value] -let regexp message_header_tail = ":" field_value? +let message_header_tail = [%sedlex.regexp? ":", Opt field_value] (* @@ -125,24 +124,27 @@ let handle_exn context exn = in error (context ^ ": " ^ s) -let parse_status_line_head = lexer +let parse_status_line_head lexbuf = [%sedlex match lexbuf with | status_line_head -> () | eof -> unexpected_eof () | _ -> invalid_character () +] -let parse_status_code = lexer +let parse_status_code lexbuf = [%sedlex match lexbuf with | status_code -> - let s = Ulexing.latin1_lexeme lexbuf in + let s = Sedlexing.Latin1.lexeme lexbuf in int_of_string s | eof -> unexpected_eof () | _ -> invalid_character () +] -let parse_status_line_tail = lexer +let parse_status_line_tail lexbuf = [%sedlex match lexbuf with | status_line_tail -> () | eof -> unexpected_eof () | _ -> invalid_character () +] let parse_status_line lexbuf = @@ -160,32 +162,36 @@ let skip_lexeme_ws lexbuf pos len = if pos = len (* end of lexeme *) then pos else - let i = Ulexing.lexeme_char lexbuf pos in - let c = Char.chr i in - match c with - | ' ' | '\t' -> aux (pos + 1) (* skip ws char *) - | _ -> pos + let c = Sedlexing.lexeme_char lexbuf pos in + let is_ws_char = + let code = Uchar.to_int c in + code = Char.code ' ' || code = Char.code '\t' + in + if is_ws_char + then aux (pos + 1) (* skip ws char *) + else pos in aux pos -let parse_message_header_tail = lexer - | message_header_tail crlf -> +let parse_message_header_tail lexbuf = [%sedlex match lexbuf with + | message_header_tail, crlf -> (* chop crlf at the end of the string (-2) *) - let len = Ulexing.lexeme_length lexbuf - 2 in + let len = Sedlexing.lexeme_length lexbuf - 2 in (* skip ':' character in front of the value *) let pos = 1 in (* skip whitespace in front of the value *) let pos = skip_lexeme_ws lexbuf pos len in - let value = Ulexing.latin1_sub_lexeme lexbuf pos (len - pos) in + let value = Sedlexing.Latin1.sub_lexeme lexbuf pos (len - pos) in value | eof -> unexpected_eof () | _ -> invalid_character () +] -let rec parse_headers accu = lexer +let rec parse_headers accu lexbuf = [%sedlex match lexbuf with | token -> - let field_name = Ulexing.latin1_lexeme lexbuf in + let field_name = Sedlexing.Latin1.lexeme lexbuf in let field_value = parse_message_header_tail lexbuf in (* NOTE: we'll have to live with the compilation warning about * String.lowercase being deprecated: can't use String.lowercase_ascii @@ -198,12 +204,13 @@ let rec parse_headers accu = lexer | eof -> unexpected_eof () | _ -> invalid_character () +] let parse_headers lexbuf = try let headers = parse_headers [] lexbuf in - let body_offset = Ulexing.lexeme_end lexbuf in + let body_offset = Sedlexing.lexeme_end lexbuf in (headers, body_offset) with exn -> handle_exn "error parsing HTTP headers" exn @@ -215,7 +222,7 @@ let parse_response_header buf len = then buf else String.sub buf 0 len in - let lexbuf = Ulexing.from_latin1_string str in + let lexbuf = Sedlexing.Latin1.from_string str in let status_code = parse_status_line lexbuf in let headers, body_offset = parse_headers lexbuf in (status_code, headers, body_offset)