From 7a43fdcdd61f5979db3e03d5d586e4c279fb2eb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Baylac-Jacqu=C3=A9?= Date: Fri, 11 Feb 2022 11:07:17 +0100 Subject: [PATCH] "native" SQLite insertion. We try to insert the rows directly from the Rust software. Sadly at a massive perf cost :( real 2m14,645s user 1m1,134s sys 1m15,060s We seem to be CPU-bound during the DB generation. I tried disabling the rollback journal *and* the synchronous pragma. I have several hypotheses explaining this slowdown: 1. The pragmas are getting incorrectly set. 2. I'm missing some necessary pragmas. To move forward, I think I should check which pragmas the native SQLite CSV import uses and somehow dump all the active pragmas somewhere. --- Cargo.lock | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 4 +- README.md | 4 ++ src/main.rs | 69 ++++++++++++++++++++++---- 4 files changed, 206 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2bc6de9..7f15490 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,146 @@ # It is not intended for manual editing.
version = 3 +[[package]] +name = "ahash" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +dependencies = [ + "getrandom", + "once_cell", + "version_check", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "fallible-iterator" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + [[package]] name = "fast-fantoir" version = "0.1.0" +dependencies = [ + "rusqlite", +] + +[[package]] +name = "getrandom" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" +dependencies = [ + "ahash", +] + +[[package]] +name = "hashlink" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7249a3129cbc1ffccd74857f81464a323a152173cdb134e0fd81bc803b29facf" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "libc" +version = "0.2.117" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c" + +[[package]] +name = "libsqlite3-sys" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2cafc7c74096c336d9d27145f7ebd4f4b6f95ba16aa5a282387267e6925cb58" +dependencies = [ + "pkg-config", + "vcpkg", +] + +[[package]] +name = "memchr" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" + +[[package]] +name = "once_cell" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" + +[[package]] +name = "pkg-config" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58893f751c9b0412871a09abd62ecd2a00298c6c83befa223ef98c52aef40cbe" + +[[package]] +name = "rusqlite" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ba4d3462c8b2e4d7f4fcfcf2b296dc6b65404fbbc7b63daa37fd485c149daf7" +dependencies = [ + "bitflags", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "memchr", + "smallvec", +] + +[[package]] +name = "smallvec" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = 
"0.10.2+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" diff --git a/Cargo.toml b/Cargo.toml index c579e29..815cb8a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,4 +5,6 @@ edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html -[dependencies] \ No newline at end of file +[dependencies] +# Tested with rusqlite 0.26.3 +rusqlite = "*" \ No newline at end of file diff --git a/README.md b/README.md index 952face..22fd9af 100644 --- a/README.md +++ b/README.md @@ -26,3 +26,7 @@ CREATE TABLE IF NOT EXISTS "fantoir"( "libelle" TEXT ); ``` + +## Data + +You can download the latest FANTOIR data over there: https://www.data.gouv.fr/fr/datasets/fichier-fantoir-des-voies-et-lieux-dits/ diff --git a/src/main.rs b/src/main.rs index 8222816..5220b24 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,14 +1,45 @@ use std::fs::File; use std::io::{BufReader, BufRead}; -fn main() -> std::io::Result<()> { - let fantoir_path = std::env::args().nth(1).unwrap(); - let file = match File::open(&fantoir_path) { - Err(err) => panic!("Cannot read file {}: {}", fantoir_path, err), - Ok(file) => file, - }; - let reader = BufReader::new(file); - println!("full_insee;rivoli_with_key;libelle"); +use rusqlite::{Connection, Result, Statement, params}; + +/// Create a new SQLite DB and sets the appropriate pragmas. +/// +/// This is a one time batch import. We don't care about atomicity +/// here. If the batch fails, then we'll start it over from scratch. +/// Disabling the rollback journal speeds up quite a lot the inserts. 
+/// +/// +fn conn_db (path: &str) -> Result { + let db = Connection::open(path)?; + db.pragma_update(None,"journal_mode","OFF")?; + db.pragma_update(None,"synchronous","OFF")?; + db.execute( + "CREATE TABLE fantoir ( + id INTEGER PRIMARY KEY, + rivoli TEXT NOT NULL, + insee TEXT NOT NULL, + libelle TEXT NOT NULL + )", [] + )?; + Ok(db) +} + +/// Parsing a FANTOIR file. This is where all the business logic lives. +/// +/// # Parsing logic +/// +/// For each line, we try to figure out what kind of record type we're +/// looking at. We're using some of the FANTOIR specificities to do so. +/// +/// 1. If the 3rd char is empty, we can assume it's a "Département" +/// record. +/// 2. If the 3rd & 6th char are empty, it's a "Commune". +/// 3. All the other record will be street records. In that case, we +/// can extract the insee code, the rivoli code (with the key) and +/// the street name (aka libelle). +#[inline] +fn parse_fantoir_lines(reader: BufReader, mut stmt: Statement) -> () { for line in reader.lines() { let l = line.unwrap(); if l.chars().nth(3) == Some(' ') { @@ -19,9 +50,27 @@ fn main() -> std::io::Result<()> { let mut full_insee = String::from(l.get(0..2).unwrap()); full_insee.push_str(l.get(3..6).unwrap()); let rivoli_with_key = l.get(6..11).unwrap(); - let libelle = l.get(15..41).unwrap(); - println!("{};{};{}", full_insee, rivoli_with_key, libelle); + let libelle = l.get(15..41).unwrap().trim(); + stmt.execute(params! 
[rivoli_with_key, full_insee, libelle]).unwrap(); } }; +} + +fn main() -> std::io::Result<()> { + let db = match conn_db("./fantoir.sqlite") { + Err(err) => panic!("Cannot close DB: {}", err), + Ok(conn) => conn + }; + let stmt = db.prepare( + "INSERT INTO fantoir (rivoli, insee, libelle) VALUES (?1, ?2, ?3)" + ).unwrap(); + let fantoir_path = std::env::args().nth(1).unwrap(); + let file = match File::open(&fantoir_path) { + Err(err) => panic!("Cannot read file {}: {}", fantoir_path, err), + Ok(file) => file, + }; + let reader = BufReader::new(file); + parse_fantoir_lines(reader, stmt); + db.close(); Ok(()) }