Compare commits

..

1 Commits

Author SHA1 Message Date
Félix Baylac-Jacqué 7a43fdcdd6 "native" SQLite insertion.
We try to insert the rows directly from the Rust software. Sadly at a massive
perf cost :(

real	2m14,645s
user	1m1,134s
sys	1m15,060s

We seem to be CPU-bound during the DB generation. I tried disabling the
rollback journal *and* the synchronous pragma. I have several hypotheses
that could explain this slowdown:

1. The pragmas are getting incorrectly set.
2. I'm missing some necessary pragmas.

To move forward, I think I should check which pragmas the native
sqlite CSV import uses and find a way to dump all the active pragmas
somewhere.
2022-02-11 11:07:17 +01:00
5 changed files with 208 additions and 213 deletions

140
Cargo.lock generated
View File

@ -2,6 +2,146 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "ahash"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
dependencies = [
"getrandom",
"once_cell",
"version_check",
]
[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "fallible-iterator"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7"
[[package]]
name = "fallible-streaming-iterator"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "fast-fantoir"
version = "0.1.0"
dependencies = [
"rusqlite",
]
[[package]]
name = "getrandom"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "hashbrown"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
dependencies = [
"ahash",
]
[[package]]
name = "hashlink"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7249a3129cbc1ffccd74857f81464a323a152173cdb134e0fd81bc803b29facf"
dependencies = [
"hashbrown",
]
[[package]]
name = "libc"
version = "0.2.117"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c"
[[package]]
name = "libsqlite3-sys"
version = "0.23.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2cafc7c74096c336d9d27145f7ebd4f4b6f95ba16aa5a282387267e6925cb58"
dependencies = [
"pkg-config",
"vcpkg",
]
[[package]]
name = "memchr"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a"
[[package]]
name = "once_cell"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5"
[[package]]
name = "pkg-config"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58893f751c9b0412871a09abd62ecd2a00298c6c83befa223ef98c52aef40cbe"
[[package]]
name = "rusqlite"
version = "0.26.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ba4d3462c8b2e4d7f4fcfcf2b296dc6b65404fbbc7b63daa37fd485c149daf7"
dependencies = [
"bitflags",
"fallible-iterator",
"fallible-streaming-iterator",
"hashlink",
"libsqlite3-sys",
"memchr",
"smallvec",
]
[[package]]
name = "smallvec"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83"
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "wasi"
version = "0.10.2+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"

View File

@ -5,4 +5,6 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
[dependencies]
# Tested with rusqlite 0.26.3
rusqlite = "*"

View File

@ -26,3 +26,7 @@ CREATE TABLE IF NOT EXISTS "fantoir"(
"libelle" TEXT
);
```
## Data
You can download the latest FANTOIR data here: https://www.data.gouv.fr/fr/datasets/fichier-fantoir-des-voies-et-lieux-dits/

View File

@ -17,16 +17,14 @@ trap clean_tmp EXIT
tmpCsv="${tmpDir}"/fantoir.csv
tmpSql="${tmpDir}"/import-fantoir.sql
echo "[+] Generating fantoir CSV"
echo "Generating fantoir CSV"
cargo run --release -- "$1" > "${tmpCsv}"
echo "[+] Generating fantoir SQLite DB"
echo "Generating fantoir SQLite DB"
cat >"${tmpSql}" <<EOF
.separator ";"
.import ${tmpCsv} fantoir
EOF
sqlite3 fantoir.sqlite < "${tmpSql}"
echo "[+] Populating DB index"
echo "CREATE INDEX insee_fantoir ON fantoir(full_insee);" | sqlite3 fantoir.sqlite
echo "[+] DB generated at $(pwd)/fantoir.sqlite"
echo "DB generated at $(pwd)/fantoir.sqlite"

View File

@ -1,14 +1,45 @@
use std::fs::File;
use std::io::{BufReader, BufRead};
fn main() -> std::io::Result<()> {
let fantoir_path = std::env::args().nth(1).unwrap();
let file = match File::open(&fantoir_path) {
Err(err) => panic!("Cannot read file {}: {}", fantoir_path, err),
Ok(file) => file,
};
let reader = BufReader::new(file);
println!("full_insee;rivoli_with_key;libelle");
use rusqlite::{Connection, Result, Statement, params};
/// Opens the SQLite database at `path` and prepares it for the bulk import.
///
/// The import is a one-shot batch job: atomicity does not matter, because a
/// failed run is simply restarted from scratch. The rollback journal is
/// therefore switched off, which speeds up the inserts considerably, and the
/// `synchronous` pragma is switched off as well.
///
/// Once both pragmas are set, the `fantoir` table is created.
///
/// # Errors
///
/// Any error reported while opening the database, updating a pragma or
/// creating the table is returned to the caller.
fn conn_db(path: &str) -> Result<Connection> {
    // Schema of the single table filled by the import.
    const CREATE_FANTOIR_TABLE: &str = "CREATE TABLE fantoir (
id INTEGER PRIMARY KEY,
rivoli TEXT NOT NULL,
insee TEXT NOT NULL,
libelle TEXT NOT NULL
)";

    let conn = Connection::open(path)?;
    // Applied in this order: journal first, then synchronous.
    for &(pragma, value) in &[("journal_mode", "OFF"), ("synchronous", "OFF")] {
        conn.pragma_update(None, pragma, value)?;
    }
    conn.execute(CREATE_FANTOIR_TABLE, [])?;
    Ok(conn)
}
/// Parsing a FANTOIR file. This is where all the business logic lives.
///
/// # Parsing logic
///
/// For each line, we try to figure out what kind of record type we're
/// looking at. We're using some of the FANTOIR specificities to do so.
///
/// 1. If the 3rd char is empty, we can assume it's a "Département"
/// record.
/// 2. If the 3rd & 6th char are empty, it's a "Commune".
/// 3. All the other records are street records. In that case, we
/// can extract the insee code, the rivoli code (with the key) and
/// the street name (aka libelle).
#[inline]
fn parse_fantoir_lines(reader: BufReader<File>, mut stmt: Statement) -> () {
for line in reader.lines() {
let l = line.unwrap();
if l.chars().nth(3) == Some(' ') {
@ -19,207 +50,27 @@ fn main() -> std::io::Result<()> {
let mut full_insee = String::from(l.get(0..2).unwrap());
full_insee.push_str(l.get(3..6).unwrap());
let rivoli_with_key = l.get(6..11).unwrap();
let street_type = street_type(l.get(11..15).unwrap());
let libelle = l.get(15..41).unwrap();
println!("{};{};{}{}", full_insee, rivoli_with_key, street_type, libelle);
let libelle = l.get(15..41).unwrap().trim();
stmt.execute(params! [rivoli_with_key, full_insee, libelle]).unwrap();
}
};
Ok(())
}
#[inline]
fn street_type(street_type: &str) -> &str {
match street_type {
"ACH " => "ANCIEN CHEMIN ",
"AER " => "AERODROME ",
"AERG" => "AEROGARE ",
"AGL " => "AGGLOMERATION ",
"AIRE" => "AIRE ",
"ALL " => "ALLEE ",
"ANGL" => "ANGLE ",
"ARC " => "ARCADE ",
"ART " => "ANCIENNE ROUTE ",
"AUT " => "AUTOROUTE ",
"AV " => "AVENUE ",
"BASE" => "BASE ",
"BD " => "BOULEVARD ",
"BER " => "BERGE ",
"BORD" => "BORD ",
"BRE " => "BARRIERE ",
"BRG " => "BOURG ",
"BRTL" => "BRETELLE ",
"BSN " => "BASSIN ",
"CAE " => "CARRIERA ",
"CALL" => "CALLE, CALLADA ",
"CAMI" => "CAMIN ",
"CAMP" => "CAMP ",
"CAN " => "CANAL ",
"CAR " => "CARREFOUR ",
"CARE" => "CARRIERE ",
"CASR" => "CASERNE ",
"CC " => "CHEMIN COMMUNAL ",
"CD " => "CHEMIN DEPARTEMENTAL ",
"CF " => "CHEMIN FORESTIER ",
"CHA " => "CHASSE ",
"CHE " => "CHEMIN ",
"CHEM" => "CHEMINEMENT ",
"CHL " => "CHALET ",
"CHP " => "CHAMP ",
"CHS " => "CHAUSSEE ",
"CHT " => "CHATEAU ",
"CHV " => "CHEMIN VICINAL ",
"CITE" => "CITE ",
"CIVE" => "COURSIVE ",
"CLOS" => "CLOS ",
"CLR " => "COULOIR ",
"COIN" => "COIN ",
"COL " => "COL ",
"COR " => "CORNICHE ",
"CORO" => "CORON ",
"COTE" => "COTE ",
"COUR" => "COUR ",
"CPG " => "CAMPING ",
"CR " => "CHEMIN RURAL ",
"CRS " => "COURS ",
"CRX " => "CROIX ",
"CTR " => "CONTOUR ",
"CTRE" => "CENTRE ",
"DARS" => "DARSE, DARCE ",
"DEVI" => "DEVIATION ",
"DIG " => "DIGUE ",
"DOM " => "DOMAINE ",
"DRA " => "DRAILLE ",
"DSC " => "DESCENTE ",
"ECA " => "ECART ",
"ECL " => "ECLUSE ",
"EMBR" => "EMBRANCHEMENT ",
"EMP " => "EMPLACEMENT ",
"ENC " => "ENCLOS ",
"ENV " => "ENCLAVE ",
"ESC " => "ESCALIER ",
"ESP " => "ESPLANADE ",
"ESPA" => "ESPACE ",
"ETNG" => "ETANG ",
"FD " => "FOND ",
"FG " => "FAUBOURG ",
"FON " => "FONTAINE ",
"FOR " => "FORET ",
"FORT" => "FORT ",
"FOS " => "FOSSE ",
"FRM " => "FERME ",
"GAL " => "GALERIE ",
"GARE" => "GARE ",
"GBD " => "GRAND BOULEVARD ",
"GPL " => "GRANDE PLACE ",
"GR " => "GRANDE RUE ",
"GREV" => "GREVE ",
"HAB " => "HABITATION ",
"HAM " => "HAMEAU ",
"HIP " => "HIPPODROME ",
"HLE " => "HALLE ",
"HLG " => "HALAGE ",
"HLM " => "HLM ",
"HTR " => "HAUTEUR ",
"ILE " => "ILE ",
"ILOT" => "ILOT ",
"IMP " => "IMPASSE ",
"JARD" => "JARDIN ",
"JTE " => "JETEE ",
"LAC " => "LAC ",
"LEVE" => "LEVEE ",
"LICE" => "LICES ",
"LIGN" => "LIGNE ",
"LOT " => "LOTISSEMENT ",
"MAIL" => "MAIL ",
"MAIS" => "MAISON ",
"MAR " => "MARCHE ",
"MARE" => "MARE ",
"MAS " => "MAS ",
"MNE " => "MORNE ",
"MRN " => "MARINA ",
"MTE " => "MONTEE ",
"NTE " => "NOUVELLE ROUTE ",
"PAE " => "PETITE AVENUE ",
"PARC" => "PARC ",
"PAS " => "PASSAGE ",
"PASS" => "PASSE ",
"PCH " => "PETIT CHEMIN ",
"PCHE" => "PORCHE ",
"PHAR" => "PHARE ",
"PIST" => "PISTE ",
"PKG " => "PARKING ",
"PL " => "PLACE ",
"PLA " => "PLACA ",
"PLAG" => "PLAGE ",
"PLAN" => "PLAN ",
"PLCI" => "PLACIS ",
"PLE " => "PASSERELLE ",
"PLN " => "PLAINE ",
"PLT " => "PLATEAU ",
"PNT " => "POINTE ",
"PONT" => "PONT ",
"PORQ" => "PORTIQUE ",
"PORT" => "PORT ",
"POST" => "POSTE ",
"POT " => "POTERNE ",
"PROM" => "PROMENADE ",
"PRT " => "PETITE, ROUTE ",
"PRV " => "PARVIS ",
"PTA " => "PETITE ALLEE ",
"PTE " => "PORTE ",
"PTR " => "PETITE RUE ",
"PTTE" => "PLACETTE ",
"QUA " => "QUARTIER ",
"QUAI" => "QUAI ",
"RAC " => "RACCOURCI ",
"REM " => "REMPART ",
"RES " => "RESIDENCE ",
"RIVE" => "RIVE ",
"RLE " => "RUELLE ",
"ROC " => "ROCADE ",
"RPE " => "RAMPE ",
"RPT " => "ROND-POINT ",
"RTD " => "ROTONDE ",
"RTE " => "ROUTE ",
"RUE " => "RUE ",
"RUET" => "RUETTE ",
"RUIS" => "RUISSEAU ",
"RULT" => "RUELLETTE ",
"RVE " => "RAVINE ",
"SAS " => "SAS ",
"SEN " => "SENTIER, SENTE ",
"SQ " => "SQUARE ",
"STDE" => "STADE ",
"TER " => "TERRE ",
"TOUR" => "TOUR ",
"TPL " => "TERRE-PLEIN ",
"TRA " => "TRAVERSE ",
"TRAB" => "TRABOULE ",
"TRN " => "TERRAIN ",
"TRT " => "TERTRE ",
"TSSE" => "TERRASSE ",
"TUN " => "TUNNEL ",
"VAL " => "VAL ",
"VALL" => "VALLON, VALLEE ",
"VC " => "VOIE COMMUNALE ",
"VCHE" => "VIEUX CHEMIN ",
"VEN " => "VENELLE ",
"VGE " => "VILLAGE ",
"VIA " => "VIA ",
"VIAD" => "VIADUC ",
"VIL " => "VILLE ",
"VLA " => "VILLA ",
"VOIE" => "VOIE ",
"VOIR" => "VOIRIE ",
"VOUT" => "VOUTE ",
"VOY " => "VOYEUL ",
"VTE " => "VIEILLE ROUTE ",
"ZA " => "ZA ",
"ZAC " => "ZAC ",
"ZAD " => "ZAD ",
"ZI " => "ZI ",
"ZONE" => "ZONE ",
"ZUP " => "ZUP ",
_ => ""
}
/// Imports the FANTOIR file given as first command-line argument into
/// `./fantoir.sqlite`.
///
/// The input file is opened *before* the database is touched, so that a
/// missing argument or an unreadable file does not leave a freshly created
/// database behind (which would make the next run fail on `CREATE TABLE`).
///
/// All the inserts run inside one explicit transaction. Without it, SQLite
/// wraps every single `INSERT` in its own implicit transaction, which
/// dominates the run time of a bulk import.
///
/// # Panics
///
/// Panics with a descriptive message if the FANTOIR path argument is
/// missing, if the input file cannot be opened, or if any database step
/// (open, begin, prepare, commit, close) fails.
fn main() -> std::io::Result<()> {
    // Validate the input first: nothing is written if the invocation is wrong.
    let fantoir_path = match std::env::args().nth(1) {
        Some(path) => path,
        None => panic!("Missing argument: path to the FANTOIR file"),
    };
    let file = match File::open(&fantoir_path) {
        Err(err) => panic!("Cannot read file {}: {}", fantoir_path, err),
        Ok(file) => file,
    };
    let reader = BufReader::new(file);

    let db = match conn_db("./fantoir.sqlite") {
        // This is the open/create step, not the close one.
        Err(err) => panic!("Cannot open DB: {}", err),
        Ok(conn) => conn,
    };

    // Batch every insert in a single transaction instead of one per row.
    if let Err(err) = db.execute_batch("BEGIN") {
        panic!("Cannot start the import transaction: {}", err);
    }
    let stmt = match db.prepare(
        "INSERT INTO fantoir (rivoli, insee, libelle) VALUES (?1, ?2, ?3)",
    ) {
        Err(err) => panic!("Cannot prepare the insert statement: {}", err),
        Ok(stmt) => stmt,
    };
    // The statement is moved into the parser and dropped when it returns,
    // so nothing borrows the connection by the time it is committed and closed.
    parse_fantoir_lines(reader, stmt);
    if let Err(err) = db.execute_batch("COMMIT") {
        panic!("Cannot commit the import transaction: {}", err);
    }

    // Report close failures instead of silently discarding them.
    if let Err((_, err)) = db.close() {
        panic!("Cannot close DB: {}", err);
    }
    Ok(())
}