From a7a9cd6adb6b121369fdb32b425a2c2f46194791 Mon Sep 17 00:00:00 2001
From: Anton Lavrik <alavrik@piqi.org>
Date: Sun, 18 Aug 2019 21:17:50 +0100
Subject: [PATCH] Use sedlex instead ulex

Sedlex is a successor of ulex based ppx.

A side effect of this change is that now we use opam for installing
third-party dependencies.
---
 .travis-ci.sh                      |  22 +---
 INSTALL                            |  51 ++-------
 Makefile                           |   7 +-
 configure                          |  27 ++---
 deps/Makefile                      |   4 -
 deps/ulex-camlp5/META              |   6 --
 deps/ulex-camlp5/Makefile          |  40 -------
 deps/ulex-camlp5/Makefile.camlp5   |  16 ---
 deps/ulex-camlp5/Makefile.download |  38 -------
 opam                               |   2 +-
 piqilib/META.in                    |   2 +-
 piqilib/Makefile                   |   4 +-
 piqilib/piq_lexer.ml               | 113 ++++++++++---------
 piqilib/piqi_json_parser.mll       |  10 +-
 piqilib/piqi_utf8.ml               | 168 +++++++++++++++++++++++++++++
 src/Makefile                       |   2 +-
 src/of_proto.ml                    |  30 +++---
 src/piqi_http.ml                   |  79 +++++++-------
 18 files changed, 327 insertions(+), 294 deletions(-)
 delete mode 100644 deps/ulex-camlp5/META
 delete mode 100644 deps/ulex-camlp5/Makefile
 delete mode 100644 deps/ulex-camlp5/Makefile.camlp5
 delete mode 100644 deps/ulex-camlp5/Makefile.download
 create mode 100644 piqilib/piqi_utf8.ml

diff --git a/.travis-ci.sh b/.travis-ci.sh
index 1bffe018..bbcd60e3 100755
--- a/.travis-ci.sh
+++ b/.travis-ci.sh
@@ -22,7 +22,7 @@ else
         brew upgrade python
 
         # build dependencies (Mac OS X)
-        brew install ocaml opam
+        brew install opam
 
         # optional dependencies for running tests
         brew install protobuf
@@ -39,36 +39,16 @@ else
         opam init
         eval `opam config env`
 
-        # install basic build dependencies using opam
-        opam install ocamlfind camlp5
-
         # so that tests pass with the latest protobuf version
         export CXXFLAGS='-std=c++14'
 
-    elif [ "$OCAML_VERSION" = "system" ]
-    then
-        # build dependencies (Ubuntu)
-        sudo apt-get install ocaml-nox camlp5 ocaml-findlib
-
-        # optional dependencies for running tests and building docs
-        #
-        # NOTE: these need to be tested only once, no need to re-run these
-        # tests for all OCaml versions
-        sudo apt-get install libprotoc-dev protobuf-compiler pandoc
-
-        echo "system OCaml version:"
-        ocaml -version
     else
         # install specific ocaml version
         wget https://raw.githubusercontent.com/ocaml/ocaml-ci-scripts/master/.travis-ocaml.sh
 
         . .travis-ocaml.sh
-
-        # install basic build dependencies using opam
-        opam install ocamlfind camlp5
     fi
 
-
     ./configure
     make deps
     make
diff --git a/INSTALL b/INSTALL
index 4f75c7bb..6f0cf5c8 100644
--- a/INSTALL
+++ b/INSTALL
@@ -19,59 +19,30 @@ architectures supported by OCaml. See the list of supported platforms at
 PREREQUISITES
 =============
 
-On Debian and Ubuntu:
+1. Install OCaml and opam (package manager for OCaml)
 
-        apt-get install ocaml camlp5 ocaml-findlib gcc make sed
+On Debian and Ubuntu:
 
+        apt-get install opam
 
 On Mac OS X (using Homebrew):
 
         brew install ocaml opam
-        opam init
-        eval `opam config env`
-        opam install ocamlfind camlp5
-
-        (Here, we used [OPAM](http://opam.ocaml.org/) to build and install
-        ocamlfind).
-
-
-On Windows:
-
-        See the "CROSS-COMPILATION" section below for instructions on building a
-        Windows executable on Linux and Mac OS X.
 
-        For building native Windows binaries, download and install OCaml for
-        Windows from here:
+On other systems, see [OPAM](http://opam.ocaml.org/)
 
-                http://protz.github.io/ocaml-installer/
+To cross-compile for Windows, see the "CROSS-COMPILATION" section below
 
-        For building under Cygwin, follow the steps from the next section.
 
+2. Configure opam
 
-On other Unix systems:
-
-        * OCaml (>= 4.03.0)
-
-                OCaml is available in most Linux distributions. It can also be
-                downloaded from here:
-
-                        http://caml.inria.fr/download.en.html
-
-        * Camlp5 -- Caml Preprocessor and Pretty Printer
-
-        * Ocamlfind/Findlib -- library manager for OCaml
-
-                If installed separately form prepackaged OCaml distribution, it
-                is recommended to build it from sources after you build/install
-                OCaml. It can be downloaded from here:
+        opam init
+        eval `opam config env`
 
-                        http://projects.camlcity.org/projects/findlib.html
-        * C compiler
 
-                GCC works. Other compilers should work if they are able to build
-                OCaml.
+3. Install dependencies
 
-        * GNU make, sed and probably some other tools.
+        opam install --deps-only ./opam
 
 
 INSTALLATION INSTRUCTIONS
@@ -81,7 +52,7 @@ INSTALLATION INSTRUCTIONS
 
         Run "./configure --help" for the list of available options.
 
-2. Build third-party dependencies
+2. Install third-party dependencies
 
         make deps
 
diff --git a/Makefile b/Makefile
index 33080b7d..777ba78b 100644
--- a/Makefile
+++ b/Makefile
@@ -24,9 +24,10 @@ endif
 
 
 deps: build-dir
-	$(MAKE) -C deps download
-	$(MAKE) -C deps
-	$(MAKE) -C deps uninstall install
+	@#$(MAKE) -C deps download
+	@#$(MAKE) -C deps
+	@#$(MAKE) -C deps uninstall install
+	opam install -y --deps-only ./opam
 
 
 build-dir:
diff --git a/configure b/configure
index 4ad06d82..c6d470d7 100755
--- a/configure
+++ b/configure
@@ -101,6 +101,20 @@ then
     # skipping everything else in cross-compilation mode; namely, we have to
     # cross-compile deps that come with the package
     echo "export SYSTEM := unix" >> $M
+
+    # figure out which dependencies we need to build
+    echo "checking whether necessary dependencies are already installed..."
+    for i in xmlm easy-format base64
+    do
+        dir="`$OCAMLFIND query $i 2>/dev/null`"
+        if [ $? -eq 0 ]
+        then
+            echo "$i is installed in $dir"
+            echo "SKIP-$i = 1" >> $M
+        else
+            echo "$i is not installed; it will be built during \"make deps\""
+        fi
+    done
 else
     # detecting the type of OCaml toolchain
     system="`ocamlc -config 2>/dev/null | grep system | sed 's/system: //'`"
@@ -109,19 +123,6 @@ else
     echo "detected $system OCaml toolchain"
 fi
 
-# figure out which dependencies we need to build
-echo "checking whether necessary dependencies are already installed..."
-for i in xmlm ulex-camlp5 easy-format base64
-do
-    dir="`$OCAMLFIND query $i 2>/dev/null`"
-    if [ $? -eq 0 ]
-    then
-        echo "$i is installed in $dir"
-        echo "SKIP-$i = 1" >> $M
-    else
-        echo "$i is not installed; it will be built during \"make deps\""
-    fi
-done
 
 OCAML_VERSION=`$OCAMLFIND ocamlc -version`
 if [ $? -ne 0 ]
diff --git a/deps/Makefile b/deps/Makefile
index ce40a70c..ea39457c 100644
--- a/deps/Makefile
+++ b/deps/Makefile
@@ -16,10 +16,6 @@ endif
 export OCAMLPATH
 
 
-ifndef SKIP-ulex
-DIRS += ulex-camlp5
-endif
-
 ifndef SKIP-xmlm
 DIRS += xmlm
 endif
diff --git a/deps/ulex-camlp5/META b/deps/ulex-camlp5/META
deleted file mode 100644
index 2fb35716..00000000
--- a/deps/ulex-camlp5/META
+++ /dev/null
@@ -1,6 +0,0 @@
-version = "1.1"
-description = "Runtime support for ulex"
-archive(byte) = "ulexing.cma"
-archive(native) = "ulexing.cmxa"
-archive(syntax,toploop) = "pa_ulex.cma ulexing.cma"
-archive(syntax,preprocessor) = "pa_ulex.cma"
diff --git a/deps/ulex-camlp5/Makefile b/deps/ulex-camlp5/Makefile
deleted file mode 100644
index 9e9a1f91..00000000
--- a/deps/ulex-camlp5/Makefile
+++ /dev/null
@@ -1,40 +0,0 @@
-OCAMLMAKEFILE := ../../make/OCamlMakefile
-
-
-RESULT = ulexing
-
-SOURCES = cset.ml utf8.ml ulexing.ml utf16.ml ulex.ml
-
-
-OCAMLFIND_PACKAGE = ulex-camlp5
-
-MODS = ulexing utf16 utf8
-LIBINSTALL_FILES = \
-	$(wildcard $(MODS:=.mli) $(MODS:=.cmi) $(MODS:=.cmx) pa_ulex.cma ulexing.a ulexing.cma ulexing.cmxa)
-
-PACKS = camlp5
-
-
-all: bcl ncl pa_ulex.cma
-
-
-install: libinstall
-
-
-uninstall: libuninstall
-
-
-pa_ulex.cma:
-	$(MAKE) -f Makefile.camlp5
-
-
-download:
-	$(MAKE) -f Makefile.download
-
-
-clean::
-	$(MAKE) -f Makefile.camlp5 clean
-	$(MAKE) -f Makefile.download clean
-
-
-include $(OCAMLMAKEFILE)
diff --git a/deps/ulex-camlp5/Makefile.camlp5 b/deps/ulex-camlp5/Makefile.camlp5
deleted file mode 100644
index 317b81c1..00000000
--- a/deps/ulex-camlp5/Makefile.camlp5
+++ /dev/null
@@ -1,16 +0,0 @@
-OCAMLMAKEFILE := ../../make/OCamlMakefile
-unexport OCAMLFIND_TOOLCHAIN
-
-
-RESULT = pa_ulex
-
-SOURCES = cset.ml utf8.ml ulexing.ml utf16.ml ulex.ml pa_ulex.ml
-
-
-INCDIRS = +camlp5
-
-
-all: bcl
-
-
-include $(OCAMLMAKEFILE)
diff --git a/deps/ulex-camlp5/Makefile.download b/deps/ulex-camlp5/Makefile.download
deleted file mode 100644
index ab55db3b..00000000
--- a/deps/ulex-camlp5/Makefile.download
+++ /dev/null
@@ -1,38 +0,0 @@
-TARBALL := v1.2-camlp5.tar.gz
-URL := https://github.com/sacerdot/ulex/archive/$(TARBALL)
-MD5 := 60aeff1073fe9bbb1be46b2da3a7618a
-
-
-UPSTREAM_FILES := \
-	cset.ml ulexing.ml ulexing.mli ulex.ml ulex.mli utf8.ml utf8.mli utf16.ml utf16.mli \
-        pa_ulex.ml
-
-
-UPSTREAM_DIR := upstream
-
-
-all: $(TARBALL) $(UPSTREAM_DIR) $(UPSTREAM_FILES)
-
-
-$(UPSTREAM_FILES): %: $(UPSTREAM_DIR)/%
-	cp $< $@
-
-
-pa_ulex.ml: %: $(UPSTREAM_DIR)/%
-	echo "(*pp camlp5o pa_macro.cmo pa_extend.cmo q_MLast.cmo *)" > $@
-	cat $< >> $@
-
-
-$(UPSTREAM_DIR): $(TARBALL)
-	mkdir -p $(UPSTREAM_DIR)
-	tar -xzf $< -C $(UPSTREAM_DIR) --strip-components=1
-
-
-$(TARBALL):
-	../../make/http-download $(URL)
-	test $(MD5) = `../../make/md5sum $(TARBALL)`
-
-
-clean:
-	rm -rf $(TARBALL) $(UPSTREAM_DIR) $(UPSTREAM_FILES)
-
diff --git a/opam b/opam
index 17838473..afc107e1 100644
--- a/opam
+++ b/opam
@@ -24,7 +24,7 @@ depends: [
   "ocaml" {>= "4.03"}
   "ocamlfind" {build}
   "easy-format"
-  "ulex-camlp5"
+  "sedlex"
   "xmlm"
   "base64" {>="3.1.0"}
 ]
diff --git a/piqilib/META.in b/piqilib/META.in
index 30043603..8e2c0b16 100644
--- a/piqilib/META.in
+++ b/piqilib/META.in
@@ -1,4 +1,4 @@
 description = "The Piqi library -- runtime support for multi-format Protobuf/JSON/XML/Piq data serialization and conversion"
-requires = "ulex-camlp5 easy-format xmlm base64"
+requires = "easy-format xmlm base64 sedlex"
 archive(byte) = "piqilib.cma"
 archive(native) = "piqilib.cmxa"
diff --git a/piqilib/Makefile b/piqilib/Makefile
index 4878ebd5..74066842 100644
--- a/piqilib/Makefile
+++ b/piqilib/Makefile
@@ -9,8 +9,7 @@ LIBINSTALL_FILES += \
 	$(wildcard *.cmi) \
 
 
-PACKS = ulex-camlp5 easy-format xmlm base64
-
+PACKS = easy-format xmlm base64 sedlex sedlex.ppx
 
 SOURCES = \
 	piqi_version.ml \
@@ -39,6 +38,7 @@ SOURCES = \
 	piqi_protobuf.ml \
 	piqi_db.ml \
 	\
+	piqi_utf8.ml \
 	piq_lexer.ml \
 	piq_parser.ml \
 	piq_gen.ml \
diff --git a/piqilib/piq_lexer.ml b/piqilib/piq_lexer.ml
index c0f8156b..969f81d5 100644
--- a/piqilib/piq_lexer.ml
+++ b/piqilib/piq_lexer.ml
@@ -1,4 +1,3 @@
-(*pp camlp5o -I `ocamlfind query ulex-camlp5` pa_ulex.cma *)
 (*
    Copyright 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2017 Anton Lavrik
 
@@ -19,6 +18,9 @@ module C = Piqi_common
 open C.Std
 
 
+module Utf8 = Piqi_utf8
+
+
 exception Error0 of string (* internally used error exception *)
 
 
@@ -87,9 +89,9 @@ let type_of_char c =
   else String_b
 
 
-let regexp digit = ['0'-'9']
-let regexp odigit = ['0'-'7']
-let regexp xdigit = ['0'-'9''a'-'f''A'-'F']
+let digit = [%sedlex.regexp? '0'..'9']
+let odigit = [%sedlex.regexp? '0'..'7']
+let xdigit = [%sedlex.regexp? '0'..'9' | 'a'..'f' | 'A'..'F']
 
 
 let make_char c =
@@ -98,13 +100,12 @@ let make_char c =
 
 let escaped_lexeme lexbuf =
   (* strip the first symbol *)
-  let start = Ulexing.get_start lexbuf in
-  let pos = Ulexing.get_pos lexbuf in
-  Ulexing.utf8_sub_lexeme lexbuf 1 (pos - start - 1)
+  let len = Sedlexing.lexeme_length lexbuf in
+  Sedlexing.Utf8.sub_lexeme lexbuf 1 (len - 1)
 
 
 (* XXX: add support for a b f v escapes? *)
-let parse_string_escape = lexer
+let parse_string_escape lexbuf = [%sedlex match lexbuf with
   | '\\' -> make_char '\\'
   | '"' -> make_char '"'
   | 't' -> make_char '\t'
@@ -113,26 +114,29 @@ let parse_string_escape = lexer
   (* XXX: disable it for now, since specifying decimals this way may make more
    * sense:
   | odigit odigit odigit ->
-      let c = int_of_ostring (Ulexing.utf8_lexeme lexbuf) in
+      let c = int_of_ostring (Sedlexing.Utf8.lexeme lexbuf) in
       (type_of_char c),c
   *)
-  | "x" xdigit xdigit ->
+  | 'x', xdigit, xdigit ->
       let c = int_of_xstring (escaped_lexeme lexbuf) in
       (type_of_char c),c
-  | 'u' xdigit xdigit xdigit xdigit ->
+  | 'u', xdigit, xdigit, xdigit, xdigit ->
       let c = int_of_xstring (escaped_lexeme lexbuf) in
       String_u,c
-  | 'U' xdigit xdigit xdigit xdigit xdigit xdigit xdigit xdigit ->
+  | 'U', xdigit, xdigit, xdigit, xdigit, xdigit, xdigit, xdigit, xdigit ->
       (* XXX: check code validity so that it doesn't exeed allocated limit *)
       let c = int_of_xstring (escaped_lexeme lexbuf) in
       String_u,c
-  | _ -> error "invalid string escape literal"
+  | _ ->
+      let s = Sedlexing.Utf8.lexeme lexbuf in
+      error ("invalid string escape literal " ^ s)
+]
 
 
 (* returns the list of integers representing codepoints *)
 (* XXX: allow only printable characters in strings? *)
 (* XXX: provide a method for wraping a string to several lines? *)
-let rec parse_string_literal ltype l = lexer
+let rec parse_string_literal ltype l lexbuf = [%sedlex match lexbuf with
   | '\\' ->
       let ctype, c = parse_string_escape lexbuf in
       let ltype =
@@ -146,14 +150,15 @@ let rec parse_string_literal ltype l = lexer
           | _,_ -> ltype (* leave the previous type *)
         in
       parse_string_literal ltype (c::l) lexbuf
-  | [0-0x1F 127] -> (* XXX: what about unicode non-printable chars? *)
+  | (0 .. 0x1F) | 127 -> (* XXX: what about unicode non-printable chars? *)
       (* do not allow non-printables to appear in string literals -- one
        * should use correspondent escaped specifications instead *)
       error "invalid string literal" 
   | eof ->
       ltype,(List.rev l)
-  | _ ->
-      let c = Ulexing.lexeme_char lexbuf 0 in
+  | any ->
+      let c = Sedlexing.lexeme_char lexbuf 0 in
+      let c = Uchar.to_int c in
       let ltype =
         match ltype with
            String_b when c > 127 -> error "invalid string literal" 
@@ -161,6 +166,11 @@ let rec parse_string_literal ltype l = lexer
          | _ -> ltype
       in
       parse_string_literal ltype (c::l) lexbuf
+  | _ ->
+      let c = Sedlexing.lexeme_char lexbuf 0 in
+      let c = Uchar.to_int c in
+      error ("invalid string codepoint " ^ string_of_int c)
+]
 
 
 let utf8_of_list l =
@@ -181,7 +191,7 @@ let string_of_list l =
 
 
 let parse_string_literal s =
-  let lexbuf = Ulexing.from_utf8_string s in
+  let lexbuf = Sedlexing.Utf8.from_string s in
   let str_type, l = parse_string_literal String_a [] lexbuf in
   let parsed_str =
     match str_type with
@@ -260,29 +270,27 @@ type token =
   | Raw_string of string
 
 
-let regexp newline = ('\n' | "\r\n")
-let regexp ws = [' ' '\t']+
+let newline = [%sedlex.regexp? '\n' | "\r\n"]
+let ws = [%sedlex.regexp? Plus (' ' | '\t')]
 
+let name = [%sedlex.regexp? (':' | '.'), Plus ('a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '/' | '.' | ':')]
 
-let regexp name = [':' '.'] ['a'-'z' 'A'-'Z' '0'-'9' '-' '_' '/' '.' ':']+
-
+let string_literal = [%sedlex.regexp? '"', Star (Compl '"' | "\\\""), '"']
 
 (* ASCII alphanumeric, '-', '_', '.', '/' for representing numbers and unquoted
  * strings (useful e.g. as DSL identifiers)
  *
  * XXX: include all alphanumeric Unicode? *)
-let regexp first_word_char = ['a'-'z' 'A'-'Z' '0'-'9' '-' '_']
-
-let regexp word_char = (first_word_char | '.' | '/')
+let first_word_char = [%sedlex.regexp? ('a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_')]
 
-let regexp word = first_word_char word_char *
+let word_char = [%sedlex.regexp? (first_word_char | '.' | '/')]
 
+let word = [%sedlex.regexp? first_word_char, Star (word_char)]
 
-let regexp float_literal =
-  ['0'-'9'] ['0'-'9' '_']*
-  ('.' ['0'-'9' '_']* )?
-  (['e' 'E'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']* )?
-
+let float_literal =
+  [%sedlex.regexp?  ('0'..'9'), Star ('0'..'9' | '_'),
+  Opt ('.', Star ('0'..'9' | '_')),
+  Opt (('e' | 'E'), Opt ('+' | '-'), '0'..'9', Star ('0'..'9' | '_'))]
 
 let is_valid_first_word_char = function
   | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' -> true
@@ -292,8 +300,7 @@ let is_valid_word_char = function
   | '.' | '/' -> true
   | x -> is_valid_first_word_char x
 
-
-(* accepts the same language as the regexp above *)
+(* accepts the same language as the word regexp above *)
 let is_valid_word s =
   let len = String.length s in
   (* NOTE: it works transparently on utf8 strings *)
@@ -314,7 +321,7 @@ let is_valid_word s =
 
 type buf =
   {
-    lexbuf : Ulexing.lexbuf;
+    lexbuf : Sedlexing.lexbuf;
 
     mutable lcount : int; (* line counter *)
     mutable lstart : int; (* buffer position of the latest line *)
@@ -336,12 +343,12 @@ let make_buf lexbuf =
 
 let update_line_counter buf =
   buf.lcount <- buf.lcount + 1;
-  buf.lstart <- Ulexing.lexeme_end buf.lexbuf
+  buf.lstart <- Sedlexing.lexeme_end buf.lexbuf
 
 
 let get_column buf = 
   (* NOTE: ennumerating columns from 1 *)
-  (Ulexing.lexeme_start buf.lexbuf) - buf.lstart + 1
+  (Sedlexing.lexeme_start buf.lexbuf) - buf.lstart + 1
 
 
 let update_column buf =
@@ -358,7 +365,7 @@ let location buf =
   buf.lcount, buf.col
 
 
-let rec token0 buf = lexer
+let rec token0 buf lexbuf = [%sedlex match lexbuf with
   | newline ->
       (* update line counter, drop column counter and move on *)
       update_line_counter buf;
@@ -369,16 +376,16 @@ let rec token0 buf = lexer
       error "invalid character"
   | "%%" ->
       error "'%%' literal is reserved for future versions"
-  | '%' ( [^'%' '\n'] [^'\n']* )? newline? -> (* skip single line comment *)
+  | '%', Opt ( Compl ('%' | '\n'), Star (Compl '\n') ), Opt newline -> (* skip single line comment *)
       update_line_counter buf;
       token0 buf lexbuf 
-  | '#' newline? -> (* verbatim empty text *)
+  | '#', Opt newline -> (* verbatim empty text *)
       Text ""
-  | '#' [^' '] ->
+  | '#', Compl ' ' ->
       error "space is expected after '#'"
-  | '#' [' '] [^'\n']* newline? -> (* verbatim text *)
+  | '#', ' ', Star (Compl '\n'), Opt newline -> (* verbatim text *)
       (* TODO: restrict string literal to contain only printable characters *)
-      let s = Ulexing.utf8_lexeme lexbuf in
+      let s = Sedlexing.Utf8.lexeme lexbuf in
       let len = String.length s in
       if len = 0
       then Text ""
@@ -399,8 +406,8 @@ let rec token0 buf = lexer
   | ']' -> Rbr
   | '*' -> Star
   | ',' -> Comma
-  | '"'([^'"']|"\\\"")*'"' -> (* string literal *)
-      let s = Ulexing.utf8_lexeme lexbuf in
+  | string_literal ->
+      let s = Sedlexing.Utf8.lexeme lexbuf in
       let s = String.sub s 1 (String.length s - 2) in (* cut double-quotes *)
 
       let (str_type, parsed_str) = parse_string_literal s in
@@ -408,10 +415,10 @@ let rec token0 buf = lexer
 
   | '"' -> error "string literal overrun"
   | name ->
-      let s = Ulexing.utf8_lexeme lexbuf in
+      let s = Sedlexing.Utf8.lexeme lexbuf in
       Name s
   | word ->
-      let s = Ulexing.utf8_lexeme lexbuf in
+      let s = Sedlexing.Utf8.lexeme lexbuf in
       Word s
 
   (* TODO: this is inconsistent - here, integers qualify as words automatically,
@@ -420,11 +427,12 @@ let rec token0 buf = lexer
    * lexing error
    *)
   | float_literal ->
-      let s = Ulexing.utf8_lexeme lexbuf in
+      let s = Sedlexing.Utf8.lexeme lexbuf in
       Word s
 
   | eof -> EOF
   | _ -> error "invalid character"
+]
 
 
 (* error reporter *)
@@ -444,10 +452,9 @@ let token1 buf =
     tok
   with
     | Error0 s -> error buf s
-    | Ulexing.Error -> error buf "lexing internal error"
-    | Ulexing.InvalidCodepoint i -> 
+    | Sedlexing.InvalidCodepoint i -> 
         error buf ("invalid unicode code point " ^ string_of_int i)
-    | Utf8.MalFormed ->
+    | Sedlexing.MalFormed ->
         error buf "malformed utf-8"
 
 
@@ -465,17 +472,17 @@ let token buf =
 
 
 let init_from_string s =
-  let lexbuf = Ulexing.from_utf8_string s in
+  let lexbuf = Sedlexing.Utf8.from_string s in
   make_buf lexbuf
 
 
 let init_from_stream s =
-  let lexbuf = Ulexing.from_utf8_stream s in
+  let lexbuf = Sedlexing.Utf8.from_stream s in
   make_buf lexbuf
 
 
 let init_from_channel ch =
-  let lexbuf = Ulexing.from_utf8_channel ch in
+  let lexbuf = Sedlexing.Utf8.from_channel ch in
   make_buf lexbuf
 
 
diff --git a/piqilib/piqi_json_parser.mll b/piqilib/piqi_json_parser.mll
index b8afe935..255fcf37 100644
--- a/piqilib/piqi_json_parser.mll
+++ b/piqilib/piqi_json_parser.mll
@@ -164,13 +164,13 @@ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       if i = pos + bytes
       then n
       else begin
-        let w = Utf8.width.(Char.code s.[i]) in
+        let w = Piqi_utf8.width.(Char.code s.[i]) in
         if w > 0 && i + w <= pos + bytes
         then (
           (* check if the next unicode char is correctly encoded in utf8 *)
-          ignore (Utf8.next s i);
+          ignore (Piqi_utf8.next s i);
           aux (succ n) (i + w)
-        ) else raise Utf8.MalFormed
+        ) else raise Piqi_utf8.MalFormed
       end
     in
     aux 0 pos
@@ -179,7 +179,7 @@ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   let check_adjust_utf8 v lexbuf s start len =
     let utf8_len =
       try utf8_length s start len
-      with Utf8.MalFormed ->
+      with Piqi_utf8.MalFormed ->
         custom_error "Invalid utf-8 sequence" v lexbuf
     in
     v.utf8_delta <- v.utf8_delta + (len - utf8_len)
@@ -345,7 +345,7 @@ and finish_escaped_char v = parse
   | 't'  { Buffer.add_char v.buf '\t' }
   | 'u' (( hex hex hex hex ) as s)
          { let i = Piq_lexer.int_of_xstring s in
-           Utf8.store v.buf i }
+           Piqi_utf8.store v.buf i }
   | _    { lexer_error "Invalid escape sequence" v lexbuf }
   | eof  { custom_error "Unexpected end of input" v lexbuf }
 
diff --git a/piqilib/piqi_utf8.ml b/piqilib/piqi_utf8.ml
new file mode 100644
index 00000000..c2aeee8a
--- /dev/null
+++ b/piqilib/piqi_utf8.ml
@@ -0,0 +1,168 @@
+(*
+
+This is a copy of ulex/utf8.ml from the ulex package.
+
+
+The package ulex is released under the terms of an MIT-like license.
+
+Copyright 2005 by Alain Frisch.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*)
+
+exception MalFormed
+
+(* cf http://www.faqs.org/rfcs/rfc3629.html *)
+
+let width = Array.make 256 (-1)
+let () =
+  for i = 0 to 127 do width.(i) <- 1 done;
+  for i = 192 to 223 do width.(i) <- 2 done;
+  for i = 224 to 239 do width.(i) <- 3 done;
+  for i = 240 to 247 do width.(i) <- 4 done
+
+let next s i =
+  match s.[i] with
+    | '\000'..'\127' as c ->
+        Char.code c
+    | '\192'..'\223' as c ->
+	let n1 = Char.code c in
+	let n2 = Char.code s.[i+1] in
+        if (n2 lsr 6 != 0b10) then raise MalFormed;
+        ((n1 land 0x1f) lsl 6) lor (n2 land 0x3f)
+    | '\224'..'\239' as c ->
+	let n1 = Char.code c in
+	let n2 = Char.code s.[i+1] in
+	let n3 = Char.code s.[i+2] in
+        if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) then raise MalFormed;
+	let p = 
+          ((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f)
+	in
+	if (p >= 0xd800) && (p <= 0xdf00) then raise MalFormed;
+	p
+    | '\240'..'\247' as c ->
+	let n1 = Char.code c in
+	let n2 = Char.code s.[i+1] in
+	let n3 = Char.code s.[i+2] in
+	let n4 = Char.code s.[i+3] in
+        if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) || (n4 lsr 6 != 0b10)
+	then raise MalFormed;
+        ((n1 land 0x07) lsl 18) lor ((n2 land 0x3f) lsl 12) lor
+        ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f)
+    | _ -> raise MalFormed
+
+
+(* With this implementation, a truncated code point will result
+   in Stream.Failure, not in MalFormed. *)
+
+let from_stream s =
+  match Stream.next s with
+    | '\000'..'\127' as c ->
+        Char.code c
+    | '\192'..'\223' as c ->
+	let n1 = Char.code c in
+	let n2 = Char.code (Stream.next s) in
+        if (n2 lsr 6 != 0b10) then raise MalFormed;
+        ((n1 land 0x1f) lsl 6) lor (n2 land 0x3f)
+    | '\224'..'\239' as c ->
+	let n1 = Char.code c in
+	let n2 = Char.code (Stream.next s) in
+	let n3 = Char.code (Stream.next s) in
+        if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) then raise MalFormed;
+        ((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f)
+    | '\240'..'\247' as c ->
+	let n1 = Char.code c in
+	let n2 = Char.code (Stream.next s) in
+	let n3 = Char.code (Stream.next s) in
+	let n4 = Char.code (Stream.next s) in
+        if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) || (n4 lsr 6 != 0b10)
+	then raise MalFormed;
+        ((n1 land 0x07) lsl 18) lor ((n2 land 0x3f) lsl 12) lor
+        ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f)
+    | _ -> raise MalFormed
+
+
+
+let compute_len s pos bytes =
+  let rec aux n i =
+    if i >= pos + bytes then if i = pos + bytes then n else raise MalFormed
+    else 
+      let w = width.(Char.code s.[i]) in
+      if w > 0 then aux (succ n) (i + w) 
+      else raise MalFormed
+  in
+  aux 0 pos
+
+let rec blit_to_int s spos a apos n =
+  if n > 0 then begin
+    a.(apos) <- next s spos;
+    blit_to_int s (spos + width.(Char.code s.[spos])) a (succ apos) (pred n)
+  end
+
+let to_int_array s pos bytes =
+  let n = compute_len s pos bytes in
+  let a = Array.make n 0 in
+  blit_to_int s pos a 0 n;
+  a
+
+(**************************)
+
+let width_code_point p =
+  if p <= 0x7f then 1
+  else if p <= 0x7ff then 2
+  else if p <= 0xffff then 3
+  else if p <= 0x10ffff then 4
+  else raise MalFormed
+
+let store b p =
+  if p <= 0x7f then
+    Buffer.add_char b (Char.chr p)
+  else if p <= 0x7ff then (
+    Buffer.add_char b (Char.chr (0xc0 lor (p lsr 6)));
+    Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
+  )
+  else if p <= 0xffff then (
+    if (p >= 0xd800 && p < 0xe000) then raise MalFormed;
+    Buffer.add_char b (Char.chr (0xe0 lor (p lsr 12)));
+    Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f)));
+    Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
+  )
+  else if p <= 0x10ffff then (
+    Buffer.add_char b (Char.chr (0xf0 lor (p lsr 18)));
+    Buffer.add_char b (Char.chr (0x80 lor ((p lsr 12) land 0x3f)));
+    Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6)  land 0x3f)));
+    Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
+  )
+  else raise MalFormed
+
+
+let from_int_array a apos len =
+  let b = Buffer.create (len * 4) in
+  let rec aux apos len =
+    if len > 0 then (store b a.(apos); aux (succ apos) (pred len))
+    else Buffer.contents b in
+  aux apos len
+
+let stream_from_char_stream s =
+  Stream.from 
+    (fun _ ->
+       try Some (from_stream s)
+       with Stream.Failure -> None)
+    
diff --git a/src/Makefile b/src/Makefile
index d5f9b159..ba63399d 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -12,7 +12,7 @@ PACKS = unix
 # OCAMLPATH to reduce complexity of the windows build (e.g. avoid converting :
 # path separators to ; along with reverting slashes); we can't simply symlink it
 # to the build dir either because symlinks don't work on windows
-PACKS += ulex-camlp5 easy-format xmlm base64
+PACKS += easy-format xmlm base64 sedlex sedlex.ppx
 INCDIRS = ../piqilib
 LIBS = ../piqilib/piqilib
 
diff --git a/src/of_proto.ml b/src/of_proto.ml
index ea5f8e1e..d81c29cc 100644
--- a/src/of_proto.ml
+++ b/src/of_proto.ml
@@ -1,4 +1,3 @@
-(*pp camlp5o -I `ocamlfind query ulex-camlp5` pa_ulex.cma *)
 (*
    Copyright 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2017 Anton Lavrik
 
@@ -244,12 +243,12 @@ Reference implementation from protobuf-2.3.0/src/google/protobuf/stubs/strutil.c
 *)
 
 
-let regexp digit = ['0'-'9']
-let regexp odigit = ['0'-'7']
-let regexp xdigit = ['0'-'9''a'-'f''A'-'F']
+let digit = [%sedlex.regexp? '0'..'9']
+let odigit = [%sedlex.regexp? '0'..'7']
+let xdigit = [%sedlex.regexp? '0'..'9' | 'a'..'f' | 'A'..'F']
 
 
-let parse_string_escape = lexer
+let parse_string_escape lexbuf = [%sedlex match lexbuf with
   | 'a' ->  '\007'
   | 'b' ->  '\008'
   | 'f' ->  '\012'
@@ -261,34 +260,37 @@ let parse_string_escape = lexer
   | '?' ->  '?'
   | '\'' -> '\''
   | '"' ->  '"'
-  | odigit odigit odigit ->
-      let v = Ulexing.latin1_lexeme lexbuf in
+  | odigit, odigit, odigit ->
+      let v = Sedlexing.Latin1.lexeme lexbuf in
       Char.chr (Piq_lexer.int_of_ostring v)
-  | ("x" | "X") xdigit xdigit ->
-      let v = Ulexing.latin1_sub_lexeme lexbuf 1 2
-      in
+  | ("x" | "X"), xdigit, xdigit ->
+      let v = Sedlexing.Latin1.sub_lexeme lexbuf 1 2 in
       Char.chr (Piq_lexer.int_of_xstring v)
   | _ ->
       piqi_error "error: invalid string escape literal in .proto"
+]
 
 
 (* returns the list of integers representing codepoints *)
-let rec parse_string_literal buf = lexer
+let rec parse_string_literal buf lexbuf = [%sedlex match lexbuf with
   | '\\' ->
       let c = parse_string_escape lexbuf in
       Buffer.add_char buf c;
       parse_string_literal buf lexbuf
   | eof ->
       Buffer.contents buf
-  | _ ->
-      let c = Ulexing.latin1_lexeme_char lexbuf 0 in
+  | any ->
+      let c = Sedlexing.Latin1.lexeme_char lexbuf 0 in
       Buffer.add_char buf c;
       parse_string_literal buf lexbuf
+  | _ ->
+      piqi_error "invalid string character in .proto"
+]
 
 
 let parse_string_literal s =
   let buf = Buffer.create (String.length s) in
-  let lexbuf = Ulexing.from_latin1_string s in
+  let lexbuf = Sedlexing.Latin1.from_string s in
   parse_string_literal buf lexbuf
 
 
diff --git a/src/piqi_http.ml b/src/piqi_http.ml
index a3fa9672..d83e04ad 100644
--- a/src/piqi_http.ml
+++ b/src/piqi_http.ml
@@ -1,4 +1,3 @@
-(*pp camlp5o -I `ocamlfind query ulex-camlp5` pa_ulex.cma *)
 (*
    Copyright 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2017, 2018 Anton Lavrik
 
@@ -27,13 +26,13 @@
  * definitions from HTTP/1.1 standard
  *)
 
-let regexp cr = '\r'
-let regexp lf = '\n'
-let regexp sp = ' '  (* space *)
-let regexp ht = '\t' (* horisontal tab *)
-let regexp crlf = cr lf
+let cr = [%sedlex.regexp? '\r']
+let lf = [%sedlex.regexp? '\n']
+let sp = [%sedlex.regexp? ' ' ]  (* space *)
+let ht = [%sedlex.regexp? '\t']  (* horisontal tab *)
+let crlf = [%sedlex.regexp? cr, lf]
 
-let regexp digit = ['0'-'9']
+let digit = [%sedlex.regexp? '0'..'9']
 
 
 (*
@@ -47,8 +46,8 @@ let regexp digit = ['0'-'9']
   TEXT           = <any OCTET except CTLs,
                    but including LWS>
 *)
-let regexp lws = crlf? (sp | ht)+
-let regexp text = [^ 0-31 127] | lws
+let lws = [%sedlex.regexp? Opt crlf, Plus (sp | ht)]
+let text = [%sedlex.regexp? Compl (0 .. 31 | 127) | lws]
 
 
 (*
@@ -58,7 +57,7 @@ let regexp text = [^ 0-31 127] | lws
                  | "/" | "[" | "]" | "?" | "="
                  | "{" | "}" | SP | HT
 *)
-let regexp token = [^ "()<>@,;:\\\"/[]?={} \t" 0-31 127-255 ]+
+let token = [%sedlex.regexp? Plus (Compl ((Chars "()<>@,;:\\\"/[]?={} \t") | 0 .. 31 | 127 .. 255))]
 
 
 (* HTTP message:
@@ -75,17 +74,17 @@ let regexp token = [^ "()<>@,;:\\\"/[]?={} \t" 0-31 127-255 ]+
   Reason-Phrase  = *<TEXT, excluding CR, LF>
 *)
 
-let regexp http_version = "HTTP" "/" digit+ "." digit+
+let http_version = [%sedlex.regexp? "HTTP", "/", Plus digit, ".", Plus digit]
 
-let regexp status_code = digit digit digit
+let status_code = [%sedlex.regexp? digit, digit, digit]
 
 (* XXX: what does excluding CR, LF mean? *)
-let regexp reason_phrase = [^ 0-31 127 '\r' '\n']*
+let reason_phrase = [%sedlex.regexp? Star (Compl (0 .. 31 | 127 | '\r' | '\n'))]
 
-let regexp status_line = http_version sp status_code sp reason_phrase crlf
+let status_line = [%sedlex.regexp? http_version, sp, status_code, sp, reason_phrase, crlf]
 
-let regexp status_line_head = http_version sp
-let regexp status_line_tail = sp reason_phrase crlf
+let status_line_head = [%sedlex.regexp? http_version, sp]
+let status_line_tail = [%sedlex.regexp? sp, reason_phrase, crlf]
 
 
 (* HTTP header:
@@ -98,11 +97,11 @@ let regexp status_line_tail = sp reason_phrase crlf
 *)
 
 (* NOTE, XXX: this regexp doesn't include quoted-string case, only "*TEXT" *)
-let regexp field_value = text*
+let field_value = [%sedlex.regexp? Star text]
 
-let regexp message_header = token ":" field_value?
+let message_header = [%sedlex.regexp? token, ":", Opt field_value]
 
-let regexp message_header_tail = ":" field_value?
+let message_header_tail = [%sedlex.regexp? ":", Opt field_value]
 
 
 (*
@@ -125,24 +124,27 @@ let handle_exn context exn =
   in error (context ^ ": " ^ s)
 
 
-let parse_status_line_head = lexer
+let parse_status_line_head lexbuf = [%sedlex match lexbuf with
   | status_line_head -> ()
   | eof -> unexpected_eof ()
   | _ -> invalid_character ()
+]
 
 
-let parse_status_code = lexer
+let parse_status_code lexbuf = [%sedlex match lexbuf with
   | status_code ->
-      let s = Ulexing.latin1_lexeme lexbuf in
+      let s = Sedlexing.Latin1.lexeme lexbuf in
       int_of_string s
   | eof -> unexpected_eof ()
   | _ -> invalid_character ()
+]
 
 
-let parse_status_line_tail = lexer
+let parse_status_line_tail lexbuf = [%sedlex match lexbuf with
   | status_line_tail -> ()
   | eof -> unexpected_eof ()
   | _ -> invalid_character ()
+]
 
 
 let parse_status_line lexbuf =
@@ -160,32 +162,36 @@ let skip_lexeme_ws lexbuf pos len =
     if pos = len (* end of lexeme *)
     then pos
     else
-      let i = Ulexing.lexeme_char lexbuf pos in
-      let c = Char.chr i in
-      match c with
-        | ' ' | '\t' -> aux (pos + 1) (* skip ws char *)
-        | _ -> pos
+      let c = Sedlexing.lexeme_char lexbuf pos in
+      let is_ws_char =
+        let code = Uchar.to_int c in
+        code = Char.code ' ' || code = Char.code '\t'
+      in
+      if is_ws_char
+      then aux (pos + 1)  (* skip ws char *)
+      else pos
   in
   aux pos
 
 
-let parse_message_header_tail = lexer
-  | message_header_tail crlf ->
+let parse_message_header_tail lexbuf = [%sedlex match lexbuf with
+  | message_header_tail, crlf ->
       (* chop crlf at the end of the string (-2) *)
-      let len = Ulexing.lexeme_length lexbuf - 2 in
+      let len = Sedlexing.lexeme_length lexbuf - 2 in
       (* skip ':' character in front of the value *)
       let pos = 1 in
       (* skip whitespace in front of the value *)
       let pos = skip_lexeme_ws lexbuf pos len in
-      let value = Ulexing.latin1_sub_lexeme lexbuf pos (len - pos) in
+      let value = Sedlexing.Latin1.sub_lexeme lexbuf pos (len - pos) in
       value
   | eof -> unexpected_eof ()
   | _ -> invalid_character ()
+]
 
 
-let rec parse_headers accu = lexer
+let rec parse_headers accu lexbuf = [%sedlex match lexbuf with
   | token ->
-      let field_name = Ulexing.latin1_lexeme lexbuf in
+      let field_name = Sedlexing.Latin1.lexeme lexbuf in
       let field_value = parse_message_header_tail lexbuf in
       (* NOTE: we'll have to live with the compilation warning about
        * String.lowercase being deprecated: can't use String.lowercase_ascii
@@ -198,12 +204,13 @@ let rec parse_headers accu = lexer
 
   | eof -> unexpected_eof ()
   | _ -> invalid_character ()
+]
 
 
 let parse_headers lexbuf =
   try
     let headers = parse_headers [] lexbuf in
-    let body_offset = Ulexing.lexeme_end lexbuf in
+    let body_offset = Sedlexing.lexeme_end lexbuf in
     (headers, body_offset)
   with exn ->
     handle_exn "error parsing HTTP headers" exn
@@ -215,7 +222,7 @@ let parse_response_header buf len =
     then buf
     else String.sub buf 0 len
   in
-  let lexbuf = Ulexing.from_latin1_string str in
+  let lexbuf = Sedlexing.Latin1.from_string str in
   let status_code = parse_status_line lexbuf in
   let headers, body_offset = parse_headers lexbuf in
   (status_code, headers, body_offset)