Initial benchmark: comparing haskell's Text with icu on EN corpus

Run `make all` to run the benchmark.
2021-06-29 21:02:27 +02:00 · 2021-06-29 21:02:27 +02:00 · d4488c5096
commit d4488c5096
5 changed files with 114 additions and 0 deletions
--- a/22
+++ b/22
@ -0,0 +1,22 @@
+# Builds the Haskell and ICU corpus readers and compares them with hyperfine.
+CFLAGS=-O2 -Wall $(shell pkgconf --cflags icu-io)
+LDFLAGS=$(shell pkgconf --libs icu-io)
+CC=gcc
+.PHONY: all bench clean
+
+all: haskell-read-utf8 icu-read-utf8 bench
+
+bench: haskell-read-utf8 icu-read-utf8
+	hyperfine ./haskell-read-utf8 ./icu-read-utf8
+
+haskell-read-utf8: ./haskell/HaskellReadUTF8.hs
+	ghc -o ./haskell-read-utf8 -O ./haskell/HaskellReadUTF8.hs
+
+# The -l flags go after the source file so the linker sees them once the symbols are needed.
+icu-read-utf8: ./icu/icu-read-utf8.c
+	$(CC) $(CFLAGS) ./icu/icu-read-utf8.c -o ./icu-read-utf8 $(LDFLAGS)
+
+# Globs are spelled out: recipes run under /bin/sh, which need not support {a,b} expansion.
+clean:
+	rm -f ./haskell/*.o ./haskell/*.hi
+	rm -f ./haskell-read-utf8 ./icu-read-utf8
--- a/default.nix
+++ b/default.nix
@ -0,0 +1,24 @@
+# Build environment for the UTF-8 benchmark.
+{ pkgs ? import <nixpkgs> {} }:
+let
+  # GHC bundled with the `text` package that haskell/HaskellReadUTF8.hs imports.
+  ghcWithText = pkgs.haskellPackages.ghcWithPackages (hp: [ hp.text ]);
+in
+pkgs.stdenv.mkDerivation {
+  pname = "bench-my-utf8";
+  version = "0.0.1";
+  # Tools and libraries the Makefile targets invoke.
+  nativeBuildInputs = with pkgs; [
+    gnumake
+    ghcWithText
+    gcc
+    icu
+    hyperfine
+    pkgconf
+  ];
+  # Only the Haskell binary is moved into $out/bin.
+  installPhase = ''
+    mkdir -p $out/bin
+    mv haskell-read-utf8 $out/bin
+  '';
+}
--- a/haskell/HaskellReadUTF8.hs
+++ b/haskell/HaskellReadUTF8.hs
@ -0,0 +1,7 @@
+-- Benchmark subject: load the English corpus with Data.Text.IO and drop it.
+module Main where
+import qualified Data.Text.IO as TIO
+import qualified Data.Text as T
+
+main :: IO ()
+main = () <$ TIO.readFile "text-test-data/english.txt"
--- a/icu/icu-read-utf8.c
+++ b/icu/icu-read-utf8.c
@ -0,0 +1,11 @@
+#include <unicode/ustdio.h>
+#include <stdlib.h>
+#include <unicode/umachine.h>
+
+/*
+ * Benchmark subject: read the English corpus through ICU's UFILE API,
+ * converting it from UTF-8 into an in-memory UChar buffer.
+ *
+ * Returns EXIT_SUCCESS once the read has been performed, or EXIT_FAILURE when
+ * the corpus cannot be opened or the buffer cannot be allocated, so that the
+ * benchmark driver does not time a run that did no work.
+ */
+int main(void)
+{
+    /* Buffer capacity in UChar units; it fits all the corpora I want to test against. */
+    const int32_t capacity = 10000000;
+    int status = EXIT_FAILURE;
+    UChar *charBuff = NULL;
+    UFILE *in = u_fopen("text-test-data/english.txt", "r", NULL, "UTF-8");
+
+    if (in != NULL) {
+        charBuff = malloc((size_t)capacity * sizeof(UChar));
+        if (charBuff != NULL) {
+            /* Only the decoding work matters here; the count read is not used. */
+            (void)u_file_read(charBuff, capacity, in);
+            status = EXIT_SUCCESS;
+        }
+        u_fclose(in);
+    }
+
+    free(charBuff); /* free(NULL) is a no-op */
+    return status;
+}
--- a/readme.md
+++ b/readme.md
@ -0,0 +1,50 @@
+# UTF-8 Benchmark
+
+The idea is to benchmark several UTF-8 aspects of the Haskell Text package, namely UTF-8 encoding/decoding and various Unicode casing operations. Hopefully, we'll find ways to improve Text's performance.
+
+For the time being, we're comparing the Text implementation with the C ICU one. In the future I also plan to test it against C++ Boost and the Rust stdlib.
+
+# Implemented so far
+
+- [x] UTF-8 decoding from file.
+  - [x] English
+  - [ ] Chinese
+  - [ ] French
+  - [ ] Russian
+- [ ] UTF-8 encoding to file.
+  - [ ] English
+  - [ ] Chinese
+  - [ ] French
+  - [ ] Russian
+- [ ] UTF-8 encoding to file.
+  - [ ] English
+  - [ ] Chinese
+  - [ ] French
+  - [ ] Russian
+
+
+# Usage
+
+```bash
+nix-shell
+make all
+```
+
+# Findings
+
+## UTF-8 Decoding
+
+```
+hyperfine ./haskell-read-utf8 ./icu-read-utf8
+Benchmark #1: ./haskell-read-utf8
+  Time (mean ± σ):      23.3 ms ±   0.9 ms    [User: 14.9 ms, System: 8.3 ms]
+  Range (min … max):    22.0 ms …  26.1 ms    111 runs
+
+Benchmark #2: ./icu-read-utf8
+  Time (mean ± σ):      12.5 ms ±   0.8 ms    [User: 7.6 ms, System: 4.9 ms]
+  Range (min … max):    11.5 ms …  16.1 ms    176 runs
+
+Summary
+  './icu-read-utf8' ran
+    1.85 ± 0.14 times faster than './haskell-read-utf8'
+```