.mode tabs
.import comsimp2015.txt city
.import depts2015.txt departement
CREATE INDEX idx_city on city (DEP, COM);
CREATE INDEX idx_dept on departement (DEP);
.mode csv
.output result.csv
SELECT departement.DEP, departement.NCCENR AS DEP_NAME, COUNT(*)
FROM departement
JOIN city ON city.DEP = departement.DEP
GROUP BY departement.DEP, departement.NCCENR;
What will happen if there is an error in the second index ?
set -e
wget http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2015/txt/depts2015.txt
wget http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2015/txt/comsimp2015.zip
unzip comsimp2015.zip
sqlite3 cities.sqlite <<$script$
.headers ON
.mode tabs
.import comsimp2015.txt city
.import depts2015.txt departement
CREATE INDEX idx_city on city (DEP, COM);
CREATE INDEX idx_dept on city (DEP); -- <-- error here
.mode csv
.output result.csv
SELECT departement.DEP, departement.NCCENR AS DEP_NAME, COUNT(*)
FROM departement
JOIN city ON city.DEP = departement.DEP
GROUP BY departement.DEP, departement.NCCENR;
$script$
set -e
# wget http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2015/txt/depts2015.txt
# wget http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2015/txt/comsimp2015.zip
# unzip comsimp2015.zip
sqlite3 cities.sqlite <<$script$
.headers ON
.mode tabs
-- .import comsimp2015.txt city
-- .import depts2015.txt departement
-- CREATE INDEX idx_city on city (DEP, COM);
CREATE INDEX idx_dept on departement (DEP);
.mode csv
.output result.csv
SELECT departement.DEP, departement.NCCENR AS DEP_NAME, COUNT(*)
FROM departement
JOIN city ON city.DEP = departement.DEP
GROUP BY departement.DEP, departement.NCCENR;
$script$
rm cities.sqlite
rm depts2015.txt
rm comsimp2015.zip
set -e
wget http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2015/txt/depts2015.txt
wget http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2015/txt/comsimp2015.zip
unzip comsimp2015.zip
sqlite3 cities.sqlite <<$script$
.headers ON
.mode tabs
.import comsimp2015.txt city
.import depts2015.txt departement
CREATE INDEX idx_city on city (DEP, COM);
CREATE INDEX idx_dept on departement (DEP);
.mode csv
.output result.csv
SELECT departement.DEP, departement.NCCENR AS DEP_NAME, COUNT(*)
FROM departement
JOIN city ON city.DEP = departement.DEP
GROUP BY departement.DEP, departement.NCCENR;
$script$
set -e
if [ ! -e depts2015.txt ]; then
curl -O "http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2015/txt/depts2015.txt"
fi
if [ ! -f comsimp2015.zip ]; then
curl -O "http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2015/txt/comsimp2015.zip"
fi
if [ ! -f comsimp2015.txt ]; then
unzip comsimp2015.zip
fi
if ! echo "SELECT * FROM city LIMIT 0;" | sqlite3 cities.sqlite 2> /dev/null ; then
sqlite3 cities.sqlite <<$script$
.mode tabs
.import comsimp2015.txt city
CREATE INDEX idx_city on city (DEP, COM);
$script$
fi
if ! echo "SELECT * FROM departement LIMIT 0;" | sqlite3 cities.sqlite 2> /dev/null ; then
sqlite3 cities.sqlite <<$script$
.mode tabs
.import depts2015.txt departement
CREATE INDEX idx_dept on departement (DEP);
$script$
fi
if [ ! -f result.csv ]; then
sqlite3 cities.sqlite <<$script$
.headers ON
.mode csv
.output result.csv
SELECT departement.DEP, departement.NCCENR AS DEP_NAME, COUNT(*)
FROM departement
JOIN city ON city.DEP = departement.DEP
GROUP BY departement.DEP, departement.NCCENR;
$script$
fi
Forces to structure the code around chunks of data
if ! echo "SELECT * FROM departement LIMIT 0;" | sqlite3 cities.sqlite 2> /dev/null ; then
sqlite3 cities.sqlite <<$script$
.mode tabs
.import depts2015.txt departement
CREATE INDEX idx_dept on departement (DEP);
$script$
NB_DEPS=`echo "SELECT COUNT(*) FROM departement;" | sqlite3 cities.sqlite`
if [ $NB_DEPS -lt 100 ] ; then
echo "More than 100 departement expected but $NB_DEPS found"
exit 1
fi
fi
It's worth duplicating some data if you get rollback
ALTER TABLE city ADD COLUMN INSEE;
UPDATE city SET INSEE = DEP || COM;
CREATE TABLE city_insee AS
SELECT *, DEP || COM AS INSEE FROM city;
You can optimize size with views if needed
A developer cost at least 200 € a day
A 240 GB SSD costs 60 €
1h <==> 100 GB !
You can spend a lot of space to same time... Don't hesitate to duplicate data if you trade it with rollback !