commit 8dd5cb8dbb1da23841059b86a008283209e1ad12 Author: Félix Baylac-Jacqué Date: Sun Jan 17 16:34:53 2021 +0100 Initial spreadsheet importer for Arkhéia Pretty crude implementation of a Libre Office Spreadsheet => Arkhéia DB. The Arkhéia format is totally bonkers. This implementation has been tested with a pretty small sample file. While it does seem to work, I'm still not 100% sure this will correctly scale on a larger import sample. Let's hope for the best and fix stuff along the way :) diff --git a/butcher-xml b/butcher-xml new file mode 100755 index 0000000..e5080e0 --- /dev/null +++ b/butcher-xml @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +# Arkeia is expecting a BOM at the front of the utf-8 file, AKA exactly what +# the unicode spec tells you NOT TO DO... (W-T-F!!!!) + +# If you miss the BOM, the file will be considered to be ASCII, +# screwing up your accents... + +# Add BOM +printf '\xEF\xBB\xBF' > $2 + +# Ah, yeah. Arkeia is also not expecting to get a valid XML but a +# *really* weird format instead. Basically they expect a set of XML +# elements for each entry. The entries being separated by a newline. + +# Butchering the XML file into something Arkeia will ingest.... +# In no particular order: +# - Removing the <root> node. +# - Removing the <entry> nodes. +# - Separating the entries by a newline. 
+xmllint --format $1 | sed '/root/d' | sed '/entry/d' | sed '/xml/d' | awk '{$1=$1};1' | tr -d '\n' | sed 's//\n/g' | tail -n +2 >> $2
diff --git a/import-spreadsheet b/import-spreadsheet
new file mode 100755
index 0000000..1ab6d66
--- /dev/null
+++ b/import-spreadsheet
@@ -0,0 +1,20 @@
+#!/usr/bin/env nix-shell
+#!nix-shell -i bash -p libxml2
+
+# Convert SPREADSHEET into OUTPUT_FILE: import.py dumps it as XML into a
+# temporary file, then butcher-xml rewrites that XML into OUTPUT_FILE.
+if [[ -z $1 || -z $2 ]]; then
+    echo "usage: import-spreadsheet SPREADSHEET OUTPUT_FILE"
+    exit 1
+fi
+
+# Abort on the first failing step: butcher-xml must never run on the
+# output of a failed conversion.
+set -e
+
+tmpFile=$(mktemp)
+# Remove the intermediate XML file however the script ends.
+trap 'rm -f "$tmpFile"' EXIT
+
+./bin/python import.py "$1" "$tmpFile"
+./butcher-xml "$tmpFile" "$2"
diff --git a/import.py b/import.py
new file mode 100644
index 0000000..881ce8e
--- /dev/null
+++ b/import.py
@@ -0,0 +1,53 @@
+"""Convert a spreadsheet into the XML file that butcher-xml post-processes.
+
+Usage: python import.py SPREADSHEET OUTPUT_FILE
+"""
+import sys
+# cElementTree was removed in Python 3.9; ElementTree offers the same API.
+import xml.etree.ElementTree as ET
+
+from pyexcel_ods import get_data
+
+
+def process_line(field_names, line, root):
+    """Append one spreadsheet row to ``root`` as an ``entry`` element.
+
+    Every name in ``field_names`` becomes a child of the entry whose text
+    is the stringified cell found at the same position in ``line``; cells
+    missing from a short row are written as empty strings.  An empty row
+    adds nothing to ``root``.
+    """
+    if not line:
+        return
+    entry = ET.SubElement(root, "entry")
+    for index, name in enumerate(field_names):
+        # A row can hold fewer cells than the header: pad the missing ones.
+        value = line[index] if index < len(line) else ""
+        ET.SubElement(entry, name).text = str(value)
+
+
+def main(argv):
+    """Convert the spreadsheet ``argv[1]`` into the XML file ``argv[2]``."""
+    if len(argv) < 3:
+        sys.exit("usage: import.py SPREADSHEET OUTPUT_FILE")
+    spreadsheet_path, out_path = argv[1], argv[2]
+    sheets = get_data(spreadsheet_path)
+    if not sheets:
+        sys.exit("{}: no sheet found".format(spreadsheet_path))
+    # Prefer the sheet named "Sheet1"; fall back to the first sheet so a
+    # document whose sheet carries another name can still be imported.
+    if "Sheet1" in sheets:
+        table = sheets["Sheet1"]
+    else:
+        table = next(iter(sheets.values()))
+    if not table:
+        sys.exit("{}: the sheet has no header row".format(spreadsheet_path))
+    root = ET.Element("root")
+    # The first row holds the field names; every other row is an entry.
+    for line in table[1:]:
+        process_line(table[0], line, root)
+    ET.ElementTree(root).write(out_path, encoding="utf8")
+
+
+if __name__ == '__main__':
+    main(sys.argv)