|
| 1 | +.. _sqlalchemy-advanced-querying: |
| 2 | + |
| 3 | +============================= |
| 4 | +SQLAlchemy: Advanced querying |
| 5 | +============================= |
| 6 | + |
| 7 | +This section of the documentation demonstrates running queries with |
| 8 | +aggregations, and queries using a fulltext index with analyzer, both |
| 9 | +using the CrateDB SQLAlchemy dialect. |
| 10 | + |
| 11 | + |
| 12 | +.. rubric:: Table of Contents |
| 13 | + |
| 14 | +.. contents:: |
| 15 | + :local: |
| 16 | + |
| 17 | + |
| 18 | +Introduction |
| 19 | +============ |
| 20 | + |
| 21 | +Import the relevant symbols: |
| 22 | + |
| 23 | + >>> import sqlalchemy as sa |
| 24 | + >>> from sqlalchemy.ext.declarative import declarative_base |
| 25 | + >>> from sqlalchemy.orm import sessionmaker |
| 26 | + >>> from uuid import uuid4 |
| 27 | + |
| 28 | +Establish a connection to the database, see also :ref:`sa:engines_toplevel` |
| 29 | +and :ref:`connect`: |
| 30 | + |
| 31 | + >>> engine = sa.create_engine(f"crate://{crate_host}") |
| 32 | + >>> connection = engine.connect() |
| 33 | + |
| 34 | +Create an SQLAlchemy :doc:`Session <sa:orm/session_basics>`: |
| 35 | + |
| 36 | + >>> session = sessionmaker(bind=engine)() |
| 37 | + >>> Base = declarative_base() |
| 38 | + |
| 39 | + |
| 40 | +Introduction to fulltext indexes |
| 41 | +================================ |
| 42 | + |
| 43 | +:ref:`crate-reference:fulltext-indices` take the contents of one or more fields |
| 44 | +and split them up into tokens that are used for fulltext search. The transformation |
| 45 | +from text to separate tokens is done by an analyzer. In order to run |
| 46 | +fulltext search queries, a :ref:`fulltext index with an analyzer |
| 47 | +<crate-reference:sql_ddl_index_fulltext>` must be defined for the related columns. |
| 48 | + |
| 49 | +In order to support fulltext query operations, the CrateDB SQLAlchemy dialect |
| 50 | +provides the :ref:`crate-reference:predicates_match` through its ``match`` function. |
| 51 | + |
| 52 | +For exercising those features, let's define a schema using SQLAlchemy's |
| 53 | +:ref:`sa:orm_declarative_mapping`: |
| 54 | + |
| 55 | + >>> def gen_key(): |
| 56 | + ... return str(uuid4()) |
| 57 | + |
| 58 | + >>> class Character(Base): |
| 59 | + ... __tablename__ = 'characters' |
| 60 | + ... id = sa.Column(sa.String, primary_key=True, default=gen_key) |
| 61 | + ... name = sa.Column(sa.String) |
| 62 | + ... quote = sa.Column(sa.String) |
| 63 | + ... name_ft = sa.Column(sa.String) |
| 64 | + ... quote_ft = sa.Column(sa.String) |
| 65 | + ... __mapper_args__ = { |
| 66 | + ... 'exclude_properties': ['name_ft', 'quote_ft'] |
| 67 | + ... } |
| 68 | + |
| 69 | +Please note that the schema defined above is provisioned to the database using |
| 70 | +the following SQL DDL statement. |
| 71 | + |
| 72 | +.. code-block:: sql |
| 73 | +
|
| 74 | + CREATE TABLE characters ( |
| 75 | + id STRING PRIMARY KEY, |
| 76 | + name STRING, |
| 77 | + quote STRING, |
| 78 | + INDEX name_ft USING fulltext(name) WITH (analyzer = 'english'), |
| 79 | + INDEX quote_ft USING fulltext(quote) WITH (analyzer = 'english') |
| 80 | + ) |
| 81 | +
|
| 82 | +.. note:: |
| 83 | + |
| 84 | + Currently, defining a *named index column definition* using a |
| 85 | + :ref:`crate-reference:sql_ddl_index_fulltext` is not supported through the |
| 86 | + SQLAlchemy declarative schema. |
| 87 | + |
| 88 | +Let's add two records we use for testing. |
| 89 | + |
| 90 | + >>> arthur = Character(name='Arthur Dent') |
| 91 | + >>> arthur.quote = "Let's go somewhere." |
| 92 | + >>> session.add(arthur) |
| 93 | + |
| 94 | + >>> trillian = Character(name='Tricia McMillan') |
| 95 | + >>> trillian.quote = "We're on a space ship Arthur. In space." |
| 96 | + >>> session.add(trillian) |
| 97 | + |
| 98 | + >>> session.commit() |
| 99 | + |
| 100 | +After ``INSERT`` statements are submitted to the database, the newly inserted |
| 101 | +records aren't immediately available for retrieval because the index is only |
| 102 | +updated periodically (default: each second). In order to synchronize that, |
| 103 | +refresh the table: |
| 104 | + |
| 105 | + >>> _ = connection.execute(sa.text("REFRESH TABLE characters")) |
| 106 | + |
| 107 | + |
| 108 | +Aggregates: Counting and grouping |
| 109 | +================================= |
| 110 | + |
| 111 | +SQLAlchemy supports different approaches to issue a query with a count |
| 112 | +aggregate function. Take a look at the `count result rows`_ documentation |
| 113 | +for a full overview. |
| 114 | + |
| 115 | +CrateDB currently does not support all of these variants, because it cannot |
| 116 | +handle sub-queries yet. |
| 117 | + |
| 118 | +This means that queries using ``count()`` have to be written in one of the |
| 119 | +following ways: |
| 120 | + |
| 121 | + >>> session.query(sa.func.count(Character.id)).scalar() |
| 122 | + 2 |
| 123 | + |
| 124 | + >>> session.query(sa.func.count('*')).select_from(Character).scalar() |
| 125 | + 2 |
| 126 | + |
| 127 | +Using the ``group_by`` clause is similar: |
| 128 | + |
| 129 | + >>> session.query(sa.func.count(Character.id), Character.name) \ |
| 130 | + ... .group_by(Character.name) \ |
| 131 | + ... .order_by(sa.desc(sa.func.count(Character.id))) \ |
| 132 | + ... .order_by(Character.name).all() |
| 133 | + [(1, 'Arthur Dent'), (1, 'Tricia McMillan')] |
| 134 | + |
| 135 | + |
| 136 | +Fulltext search with MATCH predicate |
| 137 | +==================================== |
| 138 | + |
| 139 | +Fulltext search in CrateDB is performed using the :ref:`crate-reference:predicates_match`. |
| 140 | +The CrateDB SQLAlchemy dialect comes with a ``match`` function, which can be used to |
| 141 | +search on one or multiple fields. |
| 142 | + |
| 143 | + >>> from crate.client.sqlalchemy.predicates import match |
| 144 | + |
| 145 | + >>> session.query(Character.name) \ |
| 146 | + ... .filter(match(Character.name_ft, 'Arthur')) \ |
| 147 | + ... .all() |
| 148 | + [('Arthur Dent',)] |
| 149 | + |
| 150 | +To get the relevance of a matching row, an internal system column ``_score`` |
| 151 | +can be selected. It is a numeric value which is relative to the other rows. |
| 152 | +The higher the score value, the more relevant the row. |
| 153 | + |
| 154 | +In most cases, ``_score`` is not part of the SQLAlchemy table definition, |
| 155 | +so it must be referenced by name, using ``sa.literal_column()``: |
| 156 | + |
| 157 | + >>> session.query(Character.name, sa.literal_column('_score')) \ |
| 158 | + ... .filter(match(Character.quote_ft, 'space')) \ |
| 159 | + ... .all() |
| 160 | + [('Tricia McMillan', ...)] |
| 161 | + |
| 162 | +To search on multiple columns, pass a dictionary with columns and ``boost`` |
| 163 | +attached. ``boost`` is a factor that increases the relevance of a column with |
| 164 | +respect to the other columns: |
| 165 | + |
| 166 | + >>> session.query(Character.name) \ |
| 167 | + ... .filter(match({Character.name_ft: 1.5, Character.quote_ft: 0.1}, |
| 168 | + ... 'Arthur')) \ |
| 169 | + ... .order_by(sa.desc(sa.literal_column('_score'))) \ |
| 170 | + ... .all() |
| 171 | + [('Arthur Dent',), ('Tricia McMillan',)] |
| 172 | + |
| 173 | +The ``match_type`` argument determines how a single ``query_term`` is applied |
| 174 | +and how the resulting ``_score`` is computed. Thus, it influences which |
| 175 | +documents are considered more relevant. The default selection is ``best_fields``. |
| 176 | +For more information, see :ref:`crate-reference:predicates_match_types`. |
| 177 | + |
| 178 | +Results are ordered by ``_score`` by default, but this ordering can be |
| 179 | +overridden by adding an ``order_by()`` clause. |
| 180 | + |
| 181 | + >>> session.query(Character.name) \ |
| 182 | + ... .filter( |
| 183 | + ... match(Character.name_ft, 'Arth', |
| 184 | + ... match_type='phrase', |
| 185 | + ... options={'fuzziness': 3}) |
| 186 | + ... ) \ |
| 187 | + ... .all() |
| 188 | + [('Arthur Dent',)] |
| 189 | + |
| 190 | +It is not possible to specify options without the ``match_type`` argument: |
| 191 | + |
| 192 | + >>> session.query(Character.name) \ |
| 193 | + ... .filter( |
| 194 | + ... match(Character.name_ft, 'Arth', |
| 195 | + ... options={'fuzziness': 3}) |
| 196 | + ... ) \ |
| 197 | + ... .all() |
| 198 | + Traceback (most recent call last): |
| 199 | + ValueError: missing match_type. It's not allowed to specify options without match_type |
| 200 | + |
| 201 | + |
| 202 | +``INSERT...FROM SELECT`` |
| 203 | +======================== |
| 204 | + |
| 205 | +In SQLAlchemy, the ``insert().from_select()`` function returns a new ``Insert`` |
| 206 | +construct, which represents an ``INSERT...FROM SELECT`` statement. This |
| 207 | +functionality is supported by the CrateDB client library. Here is an example |
| 208 | +that uses ``insert().from_select()``. |
| 209 | + |
| 210 | +First, let's define and create the tables: |
| 211 | + |
| 212 | + >>> from sqlalchemy import select, insert |
| 213 | + |
| 214 | + >>> class Todos(Base): |
| 215 | + ... __tablename__ = 'todos' |
| 216 | + ... __table_args__ = { |
| 217 | + ... 'crate_number_of_replicas': '0' |
| 218 | + ... } |
| 219 | + ... id = sa.Column(sa.String, primary_key=True, default=gen_key) |
| 220 | + ... content = sa.Column(sa.String) |
| 221 | + ... status = sa.Column(sa.String) |
| 222 | + |
| 223 | + >>> class ArchivedTasks(Base): |
| 224 | + ... __tablename__ = 'archived_tasks' |
| 225 | + ... __table_args__ = { |
| 226 | + ... 'crate_number_of_replicas': '0' |
| 227 | + ... } |
| 228 | + ... id = sa.Column(sa.String, primary_key=True) |
| 229 | + ... content = sa.Column(sa.String) |
| 230 | + |
| 231 | + >>> Base.metadata.create_all(bind=engine) |
| 232 | + |
| 233 | +Let's add a task to the ``Todos`` table: |
| 234 | + |
| 235 | + >>> task = Todos(content='Write Tests', status='done') |
| 236 | + >>> session.add(task) |
| 237 | + >>> session.commit() |
| 238 | + >>> _ = connection.execute(sa.text("REFRESH TABLE todos")) |
| 239 | + |
| 240 | +Now, let's use ``insert().from_select()`` to archive the task into the |
| 241 | +``ArchivedTasks`` table: |
| 242 | + |
| 243 | + >>> sel = select([Todos.id, Todos.content]).where(Todos.status == "done") |
| 244 | + >>> ins = insert(ArchivedTasks).from_select(['id','content'], sel) |
| 245 | + >>> result = session.execute(ins) |
| 246 | + >>> session.commit() |
| 247 | + |
| 248 | +This will emit the following ``INSERT`` statement to the database:: |
| 249 | + |
| 250 | + INSERT INTO archived_tasks (id, content) |
| 251 | + (SELECT todos.id, todos.content FROM todos WHERE todos.status = 'done') |
| 252 | + |
| 253 | +Now, verify that the data is present in the database: |
| 254 | + |
| 255 | + >>> _ = connection.execute(sa.text("REFRESH TABLE archived_tasks")) |
| 256 | + >>> pprint([str(r) for r in session.execute("SELECT content FROM archived_tasks")]) |
| 257 | + ["('Write Tests',)"] |
| 258 | + |
| 259 | + |
| 260 | +.. hidden: Disconnect from database |
| 261 | +
|
| 262 | + >>> session.close() |
| 263 | + >>> connection.close() |
| 264 | + >>> engine.dispose() |
| 265 | +
|
| 266 | +
|
| 267 | +.. _count result rows: https://docs.sqlalchemy.org/en/14/orm/tutorial.html#counting |
0 commit comments