commit d4488c509652d1df5108f1b92748d076ebfffe0e
Author: Félix Baylac-Jacqué
Date:   Tue Jun 29 21:02:27 2021 +0200

    Initial benchmark: comparing haskell's Text with icu on EN corpus

    Run `make all` to run the benchmark.

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..5c98cdf
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,22 @@
+CFLAGS=-O2 -Wall $(shell pkgconf --cflags icu-io)
+LDFLAGS=$(shell pkgconf --libs icu-io)
+CC=gcc
+
+all: haskell-read-utf8 icu-read-utf8 bench
+.PHONY: all
+
+bench: haskell-read-utf8 icu-read-utf8
+	hyperfine ./haskell-read-utf8 ./icu-read-utf8
+.PHONY: bench
+
+haskell-read-utf8: ./haskell/HaskellReadUTF8.hs
+	ghc -o ./haskell-read-utf8 -O ./haskell/HaskellReadUTF8.hs
+
+icu-read-utf8: ./icu/icu-read-utf8.c
+	$(CC) $(CFLAGS) ./icu/icu-read-utf8.c -o ./icu-read-utf8 $(LDFLAGS)
+
+clean:
+	rm -f ./haskell/*.o ./haskell/*.hi
+	rm -f ./haskell-read-utf8
+	rm -f ./icu-read-utf8
+.PHONY: clean
diff --git a/default.nix b/default.nix
new file mode 100644
index 0000000..c882b00
--- /dev/null
+++ b/default.nix
@@ -0,0 +1,24 @@
+{ pkgs ? import <nixpkgs> {} }:
+
+let ghcClosure =
+      pkgs.haskellPackages.ghcWithPackages
+        (p:[
+          p.text
+        ]);
+
+in pkgs.stdenv.mkDerivation {
+  pname = "bench-my-utf8";
+  version = "0.0.1";
+  installPhase = ''
+    mkdir -p $out/bin
+    mv haskell-read-utf8 $out/bin
+  '';
+  nativeBuildInputs = [
+    pkgs.gnumake
+    ghcClosure
+    pkgs.gcc
+    pkgs.icu
+    pkgs.hyperfine
+    pkgs.pkgconf
+  ];
+}
diff --git a/haskell/HaskellReadUTF8.hs b/haskell/HaskellReadUTF8.hs
new file mode 100644
index 0000000..7f45316
--- /dev/null
+++ b/haskell/HaskellReadUTF8.hs
@@ -0,0 +1,7 @@
+module Main where
+
+import qualified Data.Text.IO as TIO
+import qualified Data.Text as T
+
+main :: IO ()
+main = TIO.readFile "text-test-data/english.txt" >> pure ()
diff --git a/icu/icu-read-utf8.c b/icu/icu-read-utf8.c
new file mode 100644
index 0000000..a535392
--- /dev/null
+++ b/icu/icu-read-utf8.c
@@ -0,0 +1,35 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unicode/ustdio.h>
+
+/* Capacity of the decode buffer, counted in UChar (UTF-16) code units. */
+#define BUFFER_CAPACITY 10000000
+
+/*
+ * Decode text-test-data/english.txt from UTF-8 through ICU's ustdio layer.
+ * Exits with EXIT_FAILURE when the file cannot be opened or the buffer
+ * cannot be allocated, EXIT_SUCCESS otherwise.
+ */
+int main(void)
+{
+  UFILE *in = u_fopen("text-test-data/english.txt", "r", NULL, "UTF-8");
+  if (in == NULL) {
+    fprintf(stderr, "icu-read-utf8: cannot open text-test-data/english.txt\n");
+    return EXIT_FAILURE;
+  }
+
+  // Using a 10M UChar (~20 MB) buffer for now. It fits all the corpus I want to test against.
+  UChar *charBuff = malloc(BUFFER_CAPACITY * sizeof(UChar));
+  if (charBuff == NULL) {
+    fprintf(stderr, "icu-read-utf8: cannot allocate the decode buffer\n");
+    u_fclose(in);
+    return EXIT_FAILURE;
+  }
+
+  // Only the decoding work is benchmarked, so the returned count is discarded.
+  (void)u_file_read(charBuff, BUFFER_CAPACITY, in);
+
+  free(charBuff);
+  u_fclose(in);
+  return EXIT_SUCCESS;
+}
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..2339344
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,50 @@
+# UTF-8 Benchmark
+
+The idea is to benchmark several UTF-8 aspects of the Haskell Text package, namely UTF-8 encoding/decoding and various Unicode casing operations. Hopefully, we'll find ways to improve text's performance.
+
+For the time being, we're comparing the Text implementation with the C ICU one. In the future I also plan to test it against C++ Boost and the Rust stdlib.
+
+# Implemented so far
+
+- [x] UTF-8 decoding from file.
+  - [x] English
+  - [ ] Chinese
+  - [ ] French
+  - [ ] Russian
+- [ ] UTF-8 encoding to file.
+  - [ ] English
+  - [ ] Chinese
+  - [ ] French
+  - [ ] Russian
+- [ ] UTF-8 encoding to file.
+  - [ ] English
+  - [ ] Chinese
+  - [ ] French
+  - [ ] Russian
+
+
+# Usage
+
+```bash
+nix-shell
+make all
+```
+
+# Findings
+
+## UTF-8 Decoding
+
+```
+hyperfine ./haskell-read-utf8 ./icu-read-utf8
+Benchmark #1: ./haskell-read-utf8
+  Time (mean ± σ):      23.3 ms ±   0.9 ms    [User: 14.9 ms, System: 8.3 ms]
+  Range (min … max):    22.0 ms …  26.1 ms    111 runs
+
+Benchmark #2: ./icu-read-utf8
+  Time (mean ± σ):      12.5 ms ±   0.8 ms    [User: 7.6 ms, System: 4.9 ms]
+  Range (min … max):    11.5 ms …  16.1 ms    176 runs
+
+Summary
+  './icu-read-utf8' ran
+    1.85 ± 0.14 times faster than './haskell-read-utf8'
+```