Compare commits

...

1 Commits

Author SHA1 Message Date
Félix Baylac-Jacqué 7a43fdcdd6 "native" SQLite insertion.
We try to insert the rows directly from the Rust program. Sadly, this comes at a
massive performance cost :(

real	2m14,645s
user	1m1,134s
sys	1m15,060s

We seem to be CPU-bound during the DB generation. I tried disabling both the
rollback journal *and* the synchronous pragma. I have several hypotheses
that could explain this slowdown:

1. The pragmas are getting incorrectly set.
2. I'm missing some necessary pragmas.

To move forward, I think I should check which pragmas the native
SQLite CSV import uses and find a way to dump all the active pragmas
somewhere.
2022-02-11 11:07:17 +01:00
4 changed files with 206 additions and 11 deletions

140
Cargo.lock generated
View File

@ -2,6 +2,146 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "ahash"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
dependencies = [
"getrandom",
"once_cell",
"version_check",
]
[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "fallible-iterator"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7"
[[package]]
name = "fallible-streaming-iterator"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "fast-fantoir"
version = "0.1.0"
dependencies = [
"rusqlite",
]
[[package]]
name = "getrandom"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "hashbrown"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
dependencies = [
"ahash",
]
[[package]]
name = "hashlink"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7249a3129cbc1ffccd74857f81464a323a152173cdb134e0fd81bc803b29facf"
dependencies = [
"hashbrown",
]
[[package]]
name = "libc"
version = "0.2.117"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c"
[[package]]
name = "libsqlite3-sys"
version = "0.23.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2cafc7c74096c336d9d27145f7ebd4f4b6f95ba16aa5a282387267e6925cb58"
dependencies = [
"pkg-config",
"vcpkg",
]
[[package]]
name = "memchr"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a"
[[package]]
name = "once_cell"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5"
[[package]]
name = "pkg-config"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58893f751c9b0412871a09abd62ecd2a00298c6c83befa223ef98c52aef40cbe"
[[package]]
name = "rusqlite"
version = "0.26.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ba4d3462c8b2e4d7f4fcfcf2b296dc6b65404fbbc7b63daa37fd485c149daf7"
dependencies = [
"bitflags",
"fallible-iterator",
"fallible-streaming-iterator",
"hashlink",
"libsqlite3-sys",
"memchr",
"smallvec",
]
[[package]]
name = "smallvec"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83"
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "wasi"
version = "0.10.2+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"

View File

@ -5,4 +5,6 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
[dependencies]
# Tested with rusqlite 0.26.3
rusqlite = "*"

View File

@ -26,3 +26,7 @@ CREATE TABLE IF NOT EXISTS "fantoir"(
"libelle" TEXT
);
```
## Data
You can download the latest FANTOIR data here: https://www.data.gouv.fr/fr/datasets/fichier-fantoir-des-voies-et-lieux-dits/

View File

@ -1,14 +1,45 @@
use std::fs::File;
use std::io::{BufReader, BufRead};
fn main() -> std::io::Result<()> {
let fantoir_path = std::env::args().nth(1).unwrap();
let file = match File::open(&fantoir_path) {
Err(err) => panic!("Cannot read file {}: {}", fantoir_path, err),
Ok(file) => file,
};
let reader = BufReader::new(file);
println!("full_insee;rivoli_with_key;libelle");
use rusqlite::{Connection, Result, Statement, params};
/// Opens the SQLite database at `path` and prepares it for the bulk import.
///
/// The import is a one-shot batch job: atomicity is irrelevant because a
/// failed run is simply restarted from scratch. For that reason both the
/// rollback journal and synchronous writes are switched off before the
/// `fantoir` table is created.
///
/// Note that the table is created without `IF NOT EXISTS`.
///
/// # Errors
///
/// Propagates the `rusqlite` error raised while opening the database,
/// updating a pragma or creating the table.
fn conn_db (path: &str) -> Result<Connection> {
    // Pragmas applied, in order, right after the connection is opened.
    const PRAGMAS: [(&str, &str); 2] = [
        ("journal_mode", "OFF"),
        ("synchronous", "OFF"),
    ];
    const CREATE_FANTOIR_TABLE: &str = "CREATE TABLE fantoir (
id INTEGER PRIMARY KEY,
rivoli TEXT NOT NULL,
insee TEXT NOT NULL,
libelle TEXT NOT NULL
)";

    let conn = Connection::open(path)?;
    for &(pragma_name, pragma_value) in PRAGMAS.iter() {
        conn.pragma_update(None, pragma_name, pragma_value)?;
    }
    conn.execute(CREATE_FANTOIR_TABLE, [])?;
    Ok(conn)
}
/// Parses a FANTOIR file line by line and inserts the street records
/// through the prepared statement `stmt`. This is where all the business
/// logic lives.
///
/// # Parsing logic
///
/// For each line, we try to figure out what kind of record we are
/// looking at, relying on some FANTOIR specificities to do so.
///
/// 1. If the character at index 3 (zero-based) is a space, we assume it
///    is a "Département" record.
/// 2. If the 3rd & 6th chars are empty, it is a "Commune".
///    NOTE(review): this check is not visible in this view; confirm the
///    exact indices against the code.
/// 3. All the other records are street records. In that case, we
///    extract the INSEE code (bytes 0..2 followed by bytes 3..6), the
///    rivoli code with its key (bytes 6..11) and the street name, aka
///    libelle (bytes 15..41, trimmed).
///
/// The statement parameters are bound in the order
/// `(rivoli_with_key, full_insee, libelle)`.
///
/// # Panics
///
/// Panics if a line cannot be read, if a street record does not contain
/// the byte ranges listed above, or if executing the statement fails.
#[inline]
fn parse_fantoir_lines(reader: BufReader<File>, mut stmt: Statement) -> () {
for line in reader.lines() {
let l = line.unwrap();
if l.chars().nth(3) == Some(' ') {
@ -19,9 +50,27 @@ fn main() -> std::io::Result<()> {
let mut full_insee = String::from(l.get(0..2).unwrap());
full_insee.push_str(l.get(3..6).unwrap());
let rivoli_with_key = l.get(6..11).unwrap();
let libelle = l.get(15..41).unwrap();
println!("{};{};{}", full_insee, rivoli_with_key, libelle);
let libelle = l.get(15..41).unwrap().trim();
stmt.execute(params! [rivoli_with_key, full_insee, libelle]).unwrap();
}
};
}
/// Imports the FANTOIR file given as first command-line argument into
/// the `./fantoir.sqlite` database.
///
/// The input file is opened *before* the database is touched, so that a
/// missing argument or an unreadable file does not leave a freshly
/// created database behind.
///
/// All the inserts run inside a single explicit transaction. Without it
/// SQLite runs every `INSERT` in autocommit mode, i.e. one implicit
/// transaction per row, which dominates the run time of a bulk import.
///
/// # Panics
///
/// Panics if the argument is missing, if the input file cannot be
/// opened, or if any database operation (open, begin, prepare, commit,
/// close) fails.
fn main() -> std::io::Result<()> {
    let fantoir_path = match std::env::args().nth(1) {
        None => panic!("Missing argument: path to the FANTOIR file"),
        Some(path) => path,
    };
    let file = match File::open(&fantoir_path) {
        Err(err) => panic!("Cannot read file {}: {}", fantoir_path, err),
        Ok(file) => file,
    };
    let reader = BufReader::new(file);

    let db = match conn_db("./fantoir.sqlite") {
        Err(err) => panic!("Cannot open DB: {}", err),
        Ok(conn) => conn,
    };

    // One transaction for the whole batch instead of one per inserted row.
    if let Err(err) = db.execute_batch("BEGIN") {
        panic!("Cannot begin transaction: {}", err);
    }
    let stmt = match db.prepare(
        "INSERT INTO fantoir (rivoli, insee, libelle) VALUES (?1, ?2, ?3)"
    ) {
        Err(err) => panic!("Cannot prepare insert statement: {}", err),
        Ok(stmt) => stmt,
    };
    // The statement is moved into the parser and dropped when it returns,
    // which releases its borrow on `db` before the commit and the close.
    parse_fantoir_lines(reader, stmt);
    if let Err(err) = db.execute_batch("COMMIT") {
        panic!("Cannot commit transaction: {}", err);
    }

    // `close` hands the connection back on failure; only the error matters here.
    if let Err((_conn, err)) = db.close() {
        panic!("Cannot close DB: {}", err);
    }
    Ok(())
}