-
Notifications
You must be signed in to change notification settings - Fork 22
/
zson--1.1.sql
148 lines (125 loc) · 4.4 KB
/
zson--1.1.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION zson" to load this file. \quit
CREATE TYPE zson;
CREATE TABLE zson_dict (
dict_id SERIAL NOT NULL,
word_id INTEGER NOT NULL,
word text NOT NULL,
PRIMARY KEY(dict_id, word_id)
);
SELECT pg_catalog.pg_extension_config_dump('zson_dict', '');
-- Usage: select zson_learn('{{"table1", "col1"}, {"table2", "col2"}, ... }');
CREATE FUNCTION zson_learn(
tables_and_columns text[][],
max_examples int default 10000,
min_length int default 2,
max_length int default 128,
min_count int default 2)
RETURNS text AS $$
DECLARE
tabname text;
colname text;
query text := '';
i int;
next_dict_id int;
BEGIN
IF cardinality(tables_and_columns) = 0 THEN
RAISE NOTICE 'First argument should not be an empty array!';
RETURN '';
END IF;
FOR i IN array_lower(tables_and_columns, 1) ..
array_upper(tables_and_columns, 1)
LOOP
tabname := tables_and_columns[i][1];
colname := tables_and_columns[i][2];
IF (tabname IS NULL) OR (colname IS NULL) THEN
RAISE NOTICE 'Invalid list of tables and columns!';
RETURN '';
ELSIF position('"' in tabname) <> 0 THEN
RAISE NOTICE 'Invalid table name %', tabname;
RETURN '';
ELSIF position('"' in colname) <> 0 THEN
RAISE NOTICE 'Invalid column name %', tabname;
RETURN '';
ELSIF position('.' in tabname) <> 0 THEN
tabname := quote_ident(split_part(tabname, '.', 1)) ||
'.' || quote_ident(split_part(tabname, '.', 2));
END IF;
IF query <> '' THEN
query := query || ' union all ';
END IF;
query := query || '( select unnest(zson_extract_strings(' ||
quote_ident(colname) || ')) as t from ' || tabname || ' limit ' ||
max_examples || ')';
END LOOP;
select coalesce(max(dict_id), -1) + 1 INTO next_dict_id from zson_dict;
query := 'select t from (select t, count(*) as sum from ( ' ||
query || ' ) as tt group by t) as s where length(t) >= ' ||
min_length || ' and length(t) <= ' || max_length ||
' and sum >= ' || min_count || ' order by sum desc limit 65534';
query := 'insert into zson_dict select ' || next_dict_id ||
' as dict_id, row_number() over () as word_id, t as word from ( ' ||
query || ' ) as top_words';
EXECUTE query;
RETURN 'Done! Run " select * from zson_dict where dict_id = ' ||
next_dict_id || '; " to see a dictionary.';
END;
$$ LANGUAGE plpgsql;
CREATE FUNCTION zson_extract_strings(x jsonb)
RETURNS text[] AS $$
DECLARE
jtype text;
jitem jsonb;
BEGIN
jtype := jsonb_typeof(x);
IF jtype = 'object' THEN
RETURN array(select unnest(z) from (
select array(select jsonb_object_keys(x)) as z
union all (
select zson_extract_strings(x -> k) as z from (
select jsonb_object_keys(x) as k
) as kk
)
) as zz);
ELSIF jtype = 'array' THEN
RETURN ARRAY(select unnest(zson_extract_strings(t)) from
(select jsonb_array_elements(x) as t) as tt);
ELSIF jtype = 'string' THEN
RETURN array[ x #>> array[] :: text[] ];
ELSE -- 'number', 'boolean', 'bool'
RETURN array[] :: text[];
END IF;
END;
$$ LANGUAGE plpgsql;
CREATE FUNCTION zson_in(cstring)
RETURNS zson
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION zson_out(zson)
RETURNS cstring
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE TYPE zson (
INTERNALLENGTH = -1,
INPUT = zson_in,
OUTPUT = zson_out,
STORAGE = extended -- try to compress
);
CREATE FUNCTION jsonb_to_zson(jsonb)
RETURNS zson
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION zson_to_jsonb(zson)
RETURNS jsonb
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE CAST (jsonb AS zson) WITH FUNCTION jsonb_to_zson(jsonb) AS ASSIGNMENT;
CREATE CAST (zson AS jsonb) WITH FUNCTION zson_to_jsonb(zson) AS IMPLICIT;
CREATE FUNCTION zson_info(zson)
RETURNS cstring
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
--CREATE FUNCTION debug_dump_jsonb(jsonb)
-- RETURNS cstring
-- AS 'MODULE_PATHNAME'
-- LANGUAGE C STRICT IMMUTABLE;