Initial benchmark: comparing haskell's Text with icu on EN corpus
Run `make all` to run the benchmark.
This commit is contained in:
commit
d4488c5096
|
@ -0,0 +1,22 @@
|
||||||
|
CFLAGS=-O2 -Wall $(shell pkgconf -cflags icu-io)
|
||||||
|
LDFLAGS=$(shell pkgconf -libs icu-io)
|
||||||
|
CC=gcc
|
||||||
|
|
||||||
|
all: haskell-read-utf8 icu-read-utf8 bench
|
||||||
|
.PHONY: all
|
||||||
|
|
||||||
|
bench: haskell-read-utf8 icu-read-utf8
|
||||||
|
hyperfine ./haskell-read-utf8 ./icu-read-utf8
|
||||||
|
# bench names an action, not a file: declare it phony so it always runs.
.PHONY: bench
|
||||||
|
|
||||||
|
haskell-read-utf8: ./haskell/HaskellReadUTF8.hs
|
||||||
|
ghc -o ./haskell-read-utf8 -O ./haskell/HaskellReadUTF8.hs
|
||||||
|
|
||||||
|
icu-read-utf8: ./icu/icu-read-utf8.c
|
||||||
|
# Libraries go after the source file: the linker resolves symbols left to
# right, so -l flags placed first can be discarded before they are needed.
$(CC) $(CFLAGS) ./icu/icu-read-utf8.c -o ./icu-read-utf8 $(LDFLAGS)
|
||||||
|
|
||||||
|
clean:
|
||||||
|
# Patterns are spelled out one by one: brace expansion is a bash feature and
# the /bin/sh that make uses for recipes is not required to support it.
rm -f ./haskell/*.o ./haskell/*.hi
|
||||||
|
rm -f ./haskell-read-utf8
|
||||||
|
rm -f ./icu-read-utf8
|
||||||
|
.PHONY: clean
|
|
@ -0,0 +1,24 @@
|
||||||
|
{ pkgs ? import <nixpkgs> {} }:
|
||||||
|
|
||||||
|
let ghcClosure =
|
||||||
|
pkgs.haskellPackages.ghcWithPackages
|
||||||
|
(p:[
|
||||||
|
p.text
|
||||||
|
]);
|
||||||
|
|
||||||
|
in pkgs.stdenv.mkDerivation {
|
||||||
|
pname = "bench-my-utf8";
|
||||||
|
version = "0.0.1";
|
||||||
|
installPhase = ''
|
||||||
|
mkdir -p $out/bin
|
||||||
|
mv haskell-read-utf8 $out/bin
|
||||||
|
'';
|
||||||
|
nativeBuildInputs = [
|
||||||
|
pkgs.gnumake
|
||||||
|
ghcClosure
|
||||||
|
pkgs.gcc
|
||||||
|
pkgs.icu
|
||||||
|
pkgs.hyperfine
|
||||||
|
pkgs.pkgconf
|
||||||
|
];
|
||||||
|
}
|
|
@ -0,0 +1,7 @@
|
||||||
|
module Main where
|
||||||
|
|
||||||
|
import qualified Data.Text.IO as TIO
|
||||||
|
import qualified Data.Text as T
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
-- Read the English corpus with Data.Text.IO and discard the resulting Text.
main = () <$ TIO.readFile "text-test-data/english.txt"
|
|
@ -0,0 +1,11 @@
|
||||||
|
#include <unicode/ustdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <unicode/umachine.h>
|
||||||
|
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
UFILE *in = u_fopen("text-test-data/english.txt", "r", NULL, "UTF-8");
|
||||||
|
// Using a 10 MB buffer for now. It fits all the corpus I want to test agaisnt.
|
||||||
|
UChar *charBuff = malloc(10000000 * sizeof(UChar));
|
||||||
|
int32_t i = u_file_read(charBuff, 10000000, in);
|
||||||
|
}
|
|
@ -0,0 +1,50 @@
|
||||||
|
# UTF-8 Benchmark
|
||||||
|
|
||||||
|
The idea is to benchmark several UTF-8 aspects of the Haskell Text package, namely UTF-8 encoding/decoding and various Unicode casing operations. Hopefully, we'll find ways to improve Text's performance.
|
||||||
|
|
||||||
|
For the time being, we're comparing the Text implementation with the C ICU one. In the future I also plan to test it against C++ Boost and the Rust stdlib.
|
||||||
|
|
||||||
|
# Implemented so far
|
||||||
|
|
||||||
|
- [x] UTF-8 decoding from file.
|
||||||
|
- [x] English
|
||||||
|
- [ ] Chinese
|
||||||
|
- [ ] French
|
||||||
|
- [ ] Russian
|
||||||
|
- [ ] UTF-8 encoding to file.
|
||||||
|
- [ ] English
|
||||||
|
- [ ] Chinese
|
||||||
|
- [ ] French
|
||||||
|
- [ ] Russian
|
||||||
|
- [ ] UTF-8 encoding to file.
|
||||||
|
- [ ] English
|
||||||
|
- [ ] Chinese
|
||||||
|
- [ ] French
|
||||||
|
- [ ] Russian
|
||||||
|
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nix-shell
|
||||||
|
make all
|
||||||
|
```
|
||||||
|
|
||||||
|
# Findings
|
||||||
|
|
||||||
|
## UTF-8 Decoding
|
||||||
|
|
||||||
|
```
|
||||||
|
hyperfine ./haskell-read-utf8 ./icu-read-utf8
|
||||||
|
Benchmark #1: ./haskell-read-utf8
|
||||||
|
Time (mean ± σ): 23.3 ms ± 0.9 ms [User: 14.9 ms, System: 8.3 ms]
|
||||||
|
Range (min … max): 22.0 ms … 26.1 ms 111 runs
|
||||||
|
|
||||||
|
Benchmark #2: ./icu-read-utf8
|
||||||
|
Time (mean ± σ): 12.5 ms ± 0.8 ms [User: 7.6 ms, System: 4.9 ms]
|
||||||
|
Range (min … max): 11.5 ms … 16.1 ms 176 runs
|
||||||
|
|
||||||
|
Summary
|
||||||
|
'./icu-read-utf8' ran
|
||||||
|
1.85 ± 0.14 times faster than './haskell-read-utf8'
|
||||||
|
```
|
Loading…
Reference in New Issue