Paso 1: use una función de ventana para obtener los registros adyacentes, evitando la dolorosa autocombinación (12 tablas está muy cerca del límite a partir del cual GEQO asume el control):
-- Export 11-row sliding windows of orderedflatcorpus (ordered by id) to a
-- tab-delimited CSV file with a header line.
-- LEAD() reads the rows that follow the current one, so the corpus table
-- does not have to be joined to itself once per position.
-- Column naming: c<N><w|l|p> is the word / lemma / pos of the N-th row of
-- the window, where the current row is N = 1 (so c4p is pos, 3 rows ahead).
COPY (
    WITH token_window AS (
        SELECT
            c1.id,
            c1.source,
            c1.word,
            LEAD(c1.word,  1)  OVER by_id AS c2w,
            LEAD(c1.word,  2)  OVER by_id AS c3w,
            LEAD(c1.word,  3)  OVER by_id AS c4w,
            LEAD(c1.lemma, 3)  OVER by_id AS c4l,
            LEAD(c1.pos,   3)  OVER by_id AS c4p,
            LEAD(c1.word,  4)  OVER by_id AS c5w,
            LEAD(c1.pos,   4)  OVER by_id AS c5p,
            LEAD(c1.word,  5)  OVER by_id AS c6w,
            LEAD(c1.lemma, 5)  OVER by_id AS c6l,
            LEAD(c1.word,  6)  OVER by_id AS c7w,
            LEAD(c1.pos,   6)  OVER by_id AS c7p,
            LEAD(c1.word,  7)  OVER by_id AS c8w,
            LEAD(c1.word,  8)  OVER by_id AS c9w,
            LEAD(c1.lemma, 8)  OVER by_id AS c9l,
            LEAD(c1.pos,   8)  OVER by_id AS c9p,
            LEAD(c1.word,  9)  OVER by_id AS c10w,
            LEAD(c1.word,  10) OVER by_id AS c11w
        FROM orderedflatcorpus AS c1
        WINDOW by_id AS (ORDER BY id)
    )
    SELECT
        id,
        source,
        word,
        c2w,
        c3w,
        c4w,
        c4l,
        c4p,
        c5w,
        c6w,
        c7w,
        c8w,
        c9w,
        c9l,
        c9p,
        c10w,
        c11w
    FROM token_window
    -- Keep only windows whose rows 4..9 match the searched pattern.
    WHERE c4p LIKE 'v%'
      AND c5p = 'appge'
      AND c6l = 'way'
      AND c7p LIKE 'i%'
      AND c8w = 'the'
      AND c9p LIKE 'n%'
    ORDER BY id
)
-- Alternative destination: '/home/postgres/Results/OUTPUT.csv'
TO '/tmp/OUTPUT2.csv' WITH (FORMAT csv, DELIMITER E'\t', HEADER);
Paso 2: [modelo de datos] Las columnas {word, lemma, pos} parecen formar un grupo de baja cardinalidad; puede extraerlas a una tabla separada de token/lemma/pos:
-- Normalise the repeated (word, lemma, pos) triples of tmp.orderedflatcorpus
-- into a separate lookup table tmp.words, link every corpus row to it through
-- the new words_id column, and finally drop the three original columns.
--
-- The whole migration runs in one transaction: the final DROP COLUMN is
-- destructive, so if any earlier step fails (in particular the NOT NULL check
-- after the backfill) the remaining statements are rejected and nothing is
-- lost.
BEGIN;

-- An index to speed up the unique extraction and the final update.
-- (It is dropped automatically once the indexed columns are dropped.)
CREATE INDEX ON tmp.orderedflatcorpus (word, lemma, pos);
ANALYZE tmp.orderedflatcorpus;

-- Table containing the "squeezed out" domain: one row per distinct triple.
CREATE TABLE tmp.words AS
SELECT DISTINCT word, lemma, pos
FROM tmp.orderedflatcorpus;

-- Surrogate key for the lookup table.
ALTER TABLE tmp.words
    ADD COLUMN id SERIAL NOT NULL PRIMARY KEY;

-- The natural key stays unique.
ALTER TABLE tmp.words
    ADD UNIQUE (word, lemma, pos);

-- The original table needs an FK "link" to the new table.
-- It cannot be NOT NULL yet: existing rows start out with words_id = NULL.
ALTER TABLE tmp.orderedflatcorpus
    ADD COLUMN words_id INTEGER
    REFERENCES tmp.words (id);

-- FK constraints are helped a lot by a supporting index.
CREATE INDEX orderedflatcorpus_words_id_fk ON tmp.orderedflatcorpus (words_id);

ANALYZE tmp.orderedflatcorpus;
ANALYZE tmp.words;

-- Initialise the FK column in the original table.
-- IS NOT DISTINCT FROM is required here because the joined columns could
-- contain NULLs, which MUST compare equal.
UPDATE tmp.orderedflatcorpus AS dst
SET words_id = src.id
FROM tmp.words AS src
WHERE src.word  IS NOT DISTINCT FROM dst.word
  AND src.lemma IS NOT DISTINCT FROM dst.lemma
  AND src.pos   IS NOT DISTINCT FROM dst.pos;

-- Every row must now be linked. This fails (and aborts the transaction)
-- if the backfill missed a row, *before* the source columns are dropped.
-- It also guarantees that an inner join on words_id never drops corpus rows,
-- which would shift row offsets in LEAD()-based queries over this table.
ALTER TABLE tmp.orderedflatcorpus
    ALTER COLUMN words_id SET NOT NULL;

-- The triple now lives in tmp.words only.
ALTER TABLE tmp.orderedflatcorpus
    DROP COLUMN word,
    DROP COLUMN lemma,
    DROP COLUMN pos;

COMMIT;
Y la nueva consulta, con un JOIN a la tabla de palabras:
-- Same sliding-window export as the single-table query, but word / lemma /
-- pos are now read from the words lookup table through
-- orderedflatcorpus.words_id.
-- Column naming: c<N><w|l|p> is the word / lemma / pos of the N-th row of
-- the window, where the current row is N = 1.
-- NOTE(review): table names are unqualified here while the migration script
-- uses the tmp schema; this presumably relies on search_path. Confirm.
COPY (
    WITH token_window AS (
        SELECT
            c1.id,
            c1.source,
            w.word,
            LEAD(w.word,  1)  OVER by_id AS c2w,
            LEAD(w.word,  2)  OVER by_id AS c3w,
            LEAD(w.word,  3)  OVER by_id AS c4w,
            LEAD(w.lemma, 3)  OVER by_id AS c4l,
            LEAD(w.pos,   3)  OVER by_id AS c4p,
            LEAD(w.word,  4)  OVER by_id AS c5w,
            LEAD(w.pos,   4)  OVER by_id AS c5p,
            LEAD(w.word,  5)  OVER by_id AS c6w,
            LEAD(w.lemma, 5)  OVER by_id AS c6l,
            LEAD(w.word,  6)  OVER by_id AS c7w,
            LEAD(w.pos,   6)  OVER by_id AS c7p,
            LEAD(w.word,  7)  OVER by_id AS c8w,
            LEAD(w.word,  8)  OVER by_id AS c9w,
            LEAD(w.lemma, 8)  OVER by_id AS c9l,
            LEAD(w.pos,   8)  OVER by_id AS c9p,
            LEAD(w.word,  9)  OVER by_id AS c10w,
            LEAD(w.word,  10) OVER by_id AS c11w
        FROM orderedflatcorpus AS c1
        INNER JOIN words AS w
            ON w.id = c1.words_id
        WINDOW by_id AS (ORDER BY c1.id)
    )
    SELECT
        id,
        source,
        word,
        c2w,
        c3w,
        c4w,
        c4l,
        c4p,
        c5w,
        c6w,
        c7w,
        c8w,
        c9w,
        c9l,
        c9p,
        c10w,
        c11w
    FROM token_window
    -- Keep only windows whose rows 4..9 match the searched pattern.
    WHERE c4p LIKE 'v%'
      AND c5p = 'appge'
      AND c6l = 'way'
      AND c7p LIKE 'i%'
      AND c8w = 'the'
      AND c9p LIKE 'n%'
    ORDER BY id
)
-- Alternative destination: '/home/postgres/Results/OUTPUT.csv'
TO '/tmp/OUTPUT3.csv' WITH (FORMAT csv, DELIMITER E'\t', HEADER);
Nota: obtengo dos líneas en la salida porque relajé demasiado las condiciones...
Actualización: la primera consulta, evitando la CTE
-- Single-table sliding-window export, written with a derived table in FROM
-- instead of a CTE. The projection and the filters are the same as in the
-- CTE form of this query.
-- Column naming: c<N><w|l|p> is the word / lemma / pos of the N-th row of
-- the window, where the current row is N = 1.
COPY (
    SELECT
        id,
        source,
        word,
        c2w,
        c3w,
        c4w,
        c4l,
        c4p,
        c5w,
        c6w,
        c7w,
        c8w,
        c9w,
        c9l,
        c9p,
        c10w,
        c11w
    FROM (
        SELECT
            c1.id,
            c1.source,
            c1.word,
            LEAD(c1.word,  1)  OVER by_id AS c2w,
            LEAD(c1.word,  2)  OVER by_id AS c3w,
            LEAD(c1.word,  3)  OVER by_id AS c4w,
            LEAD(c1.lemma, 3)  OVER by_id AS c4l,
            LEAD(c1.pos,   3)  OVER by_id AS c4p,
            LEAD(c1.word,  4)  OVER by_id AS c5w,
            LEAD(c1.pos,   4)  OVER by_id AS c5p,
            LEAD(c1.word,  5)  OVER by_id AS c6w,
            LEAD(c1.lemma, 5)  OVER by_id AS c6l,
            LEAD(c1.word,  6)  OVER by_id AS c7w,
            LEAD(c1.pos,   6)  OVER by_id AS c7p,
            LEAD(c1.word,  7)  OVER by_id AS c8w,
            LEAD(c1.word,  8)  OVER by_id AS c9w,
            LEAD(c1.lemma, 8)  OVER by_id AS c9l,
            LEAD(c1.pos,   8)  OVER by_id AS c9p,
            LEAD(c1.word,  9)  OVER by_id AS c10w,
            LEAD(c1.word,  10) OVER by_id AS c11w
        FROM orderedflatcorpus AS c1
        WINDOW by_id AS (ORDER BY id)
    ) AS token_window
    -- Keep only windows whose rows 4..9 match the searched pattern.
    WHERE c4p LIKE 'v%'
      AND c5p = 'appge'
      AND c6l = 'way'
      AND c7p LIKE 'i%'
      AND c8w = 'the'
      AND c9p LIKE 'n%'
    ORDER BY id
)
-- Alternative destination: '/home/postgres/Results/OUTPUT.csv'
TO '/tmp/OUTPUT2a.csv' WITH (FORMAT csv, DELIMITER E'\t', HEADER);
[se podría realizar una transformación similar en la segunda consulta]
ACTUALIZACIÓN 2: la versión con subconsulta para la variante de dos tablas.
-- Derived-table form of the two-table (orderedflatcorpus + words) query.
-- Returns the rows directly. The commented COPY wrapper and TO clauses show
-- how to export the result instead, and EXPLAIN ANALYZE can be enabled to
-- inspect the plan.
-- Column naming: c<N><w|l|p> is the word / lemma / pos of the N-th row of
-- the window, where the current row is N = 1; c1i / c1s are the id and
-- source of the current row.
-- COPY (
-- EXPLAIN ANALYZE
SELECT
    c1i,
    c1s,
    c1w,
    c2w,
    c3w,
    c4w,
    c4l,
    c4p,
    c5w,
    c6w,
    c7w,
    c8w,
    c9w,
    c9l,
    c9p,
    c10w,
    c11w
FROM (
    SELECT
        c1.id     AS c1i,
        c1.source AS c1s,
        w1.word   AS c1w,
        LEAD(w1.word,  1)  OVER next_rows AS c2w,
        LEAD(w1.word,  2)  OVER next_rows AS c3w,
        LEAD(w1.word,  3)  OVER next_rows AS c4w,
        LEAD(w1.lemma, 3)  OVER next_rows AS c4l,
        LEAD(w1.pos,   3)  OVER next_rows AS c4p,
        LEAD(w1.word,  4)  OVER next_rows AS c5w,
        LEAD(w1.pos,   4)  OVER next_rows AS c5p,
        LEAD(w1.word,  5)  OVER next_rows AS c6w,
        LEAD(w1.lemma, 5)  OVER next_rows AS c6l,
        LEAD(w1.word,  6)  OVER next_rows AS c7w,
        LEAD(w1.pos,   6)  OVER next_rows AS c7p,
        LEAD(w1.word,  7)  OVER next_rows AS c8w,
        LEAD(w1.word,  8)  OVER next_rows AS c9w,
        LEAD(w1.lemma, 8)  OVER next_rows AS c9l,
        LEAD(w1.pos,   8)  OVER next_rows AS c9p,
        LEAD(w1.word,  9)  OVER next_rows AS c10w,
        LEAD(w1.word,  10) OVER next_rows AS c11w
    FROM orderedflatcorpus AS c1
    INNER JOIN words AS w1
        ON w1.id = c1.words_id
    -- WHERE TRUE is only an anchor for the disabled predicates below.
    WHERE TRUE
    -- Disabled experiment. Author's note: these *could* prune out unmatched
    -- items, but I could not get it to work ...
    --   AND EXISTS (SELECT * FROM orderedflatcorpus c4 JOIN words w4 ON w4.id = c4.words_id
    --               WHERE c4.id = 3 + c1.id   -- AND w4.pos LIKE 'v%'
    --              )
    --   AND EXISTS (SELECT * FROM orderedflatcorpus c5 JOIN words w5 ON w5.id = c5.words_id
    --               WHERE c5.id = 4 + c1.id   -- AND w5.pos = 'appge'
    --              )
    --   AND EXISTS (SELECT * FROM orderedflatcorpus c7 JOIN words w7 ON w7.id = c7.words_id
    --               WHERE c7.id = 6 + c1.id   -- AND w7.pos LIKE 'i%'
    --              )
    --   AND EXISTS (SELECT * FROM orderedflatcorpus c9 JOIN words w9 ON w9.id = c9.words_id
    --               WHERE c9.id = 8 + c1.id   -- AND w9.pos LIKE 'n%'
    --              )
    --   AND EXISTS (SELECT * FROM orderedflatcorpus c8 JOIN words w8 ON w8.id = c8.words_id
    --               WHERE c8.id = 7 + c1.id   -- AND w8.word = 'the'
    --              )
    -- The window carries a frame clause, so it has to be referenced as
    -- OVER next_rows (no parentheses).
    WINDOW next_rows AS (ORDER BY c1.id ROWS BETWEEN CURRENT ROW AND 10 FOLLOWING)
) AS token_window
-- Keep only windows whose rows 4..9 match the searched pattern.
WHERE c4p LIKE 'v%'
  AND c5p = 'appge'
  AND c6l = 'way'
  AND c7p LIKE 'i%'
  AND c8w = 'the'
  AND c9p LIKE 'n%'
ORDER BY c1i
;
-- )
-- TO '/home/postgres/Results/OUTPUT.csv' DELIMITER E'\t' csv header;
-- TO '/tmp/OUTPUT3b.csv' DELIMITER E'\t' csv header;