Finished Release v2.1.0

This commit is contained in:
Itai Shirav 2020-07-16 07:22:18 +03:00
commit ebdadb3aee
71 changed files with 2218 additions and 216 deletions

View File

@ -1,6 +1,20 @@
Change Log
==========
v2.1.0
------
- Support for model constraints
- Support for data skipping indexes
- Support for mutations: `QuerySet.update` and `QuerySet.delete`
- Added functions for working with external dictionaries
- Support FINAL for `ReplacingMergeTree` (chripede)
- Added `DateTime64Field` (NiyazNz)
- Make `DateTimeField` and `DateTime64Field` timezone-aware (NiyazNz)
**Backwards incompatible changes**
Previously, `DateTimeField` always converted its value from the database timezone to UTC. This is no longer the case: the field's value now preserves the timezone it was defined with, or if not specified - the database's global timezone. This change has no effect if your database timezone is set to UTC.
v2.0.1
------
- Remove unnecessary import of `six`

View File

@ -52,4 +52,6 @@ for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percen
print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row))
```
This and other examples can be found in the `examples` folder.
To learn more please visit the [documentation](docs/toc.md).

View File

@ -178,7 +178,7 @@ Unrecognized field names will cause an `AttributeError`.
#### Model.create_table_sql(db)
Returns the SQL command for creating a table for this model.
Returns the SQL statement for creating a table for this model.
#### Model.drop_table_sql(db)
@ -203,7 +203,7 @@ The `field_names` list must match the fields defined in the model, but does not
- `line`: the TSV-formatted data.
- `field_names`: names of the model fields in the data.
- `timezone_in_use`: the timezone to use when parsing dates and datetimes.
- `timezone_in_use`: the timezone to use when parsing dates and datetimes. Some fields use their own timezones.
- `database`: if given, sets the database that this instance belongs to.
@ -308,7 +308,7 @@ Unrecognized field names will cause an `AttributeError`.
#### BufferModel.create_table_sql(db)
Returns the SQL command for creating a table for this model.
Returns the SQL statement for creating a table for this model.
#### BufferModel.drop_table_sql(db)
@ -333,7 +333,7 @@ The `field_names` list must match the fields defined in the model, but does not
- `line`: the TSV-formatted data.
- `field_names`: names of the model fields in the data.
- `timezone_in_use`: the timezone to use when parsing dates and datetimes.
- `timezone_in_use`: the timezone to use when parsing dates and datetimes. Some fields use their own timezones.
- `database`: if given, sets the database that this instance belongs to.
@ -422,12 +422,147 @@ Returns the instance's column values as a tab-separated line. A newline is not i
- `include_readonly`: if false, returns only fields that can be inserted into database.
### MergeModel
Extends Model
Model for Merge engine
Predefines the virtual `_table` column and ensures that rows can't be inserted into this table type
https://clickhouse.tech/docs/en/single/index.html#document-table_engines/merge
#### MergeModel(**kwargs)
Creates a model instance, using keyword arguments as field values.
Since values are immediately converted to their Pythonic type,
invalid values will cause a `ValueError` to be raised.
Unrecognized field names will cause an `AttributeError`.
#### MergeModel.create_table_sql(db)
Returns the SQL statement for creating a table for this model.
#### MergeModel.drop_table_sql(db)
Returns the SQL command for deleting this model's table.
#### MergeModel.fields(writable=False)
Returns an `OrderedDict` of the model's fields (from name to `Field` instance).
If `writable` is true, only writable fields are included.
Callers should not modify the dictionary.
#### MergeModel.from_tsv(line, field_names, timezone_in_use=UTC, database=None)
Create a model instance from a tab-separated line. The line may or may not include a newline.
The `field_names` list must match the fields defined in the model, but does not have to include all of them.
- `line`: the TSV-formatted data.
- `field_names`: names of the model fields in the data.
- `timezone_in_use`: the timezone to use when parsing dates and datetimes. Some fields use their own timezones.
- `database`: if given, sets the database that this instance belongs to.
#### get_database()
Gets the `Database` that this model instance belongs to.
Returns `None` unless the instance was read from the database or written to it.
#### get_field(name)
Gets a `Field` instance given its name, or `None` if not found.
#### MergeModel.has_funcs_as_defaults()
Return True if some of the model's fields use a function expression
as a default value. This requires special handling when inserting instances.
#### MergeModel.is_read_only()
Returns true if the model is marked as read only.
#### MergeModel.is_system_model()
Returns true if the model represents a system table.
#### MergeModel.objects_in(database)
Returns a `QuerySet` for selecting instances of this model class.
#### set_database(db)
Sets the `Database` that this model instance belongs to.
This is done automatically when the instance is read from the database or written to it.
#### MergeModel.table_name()
Returns the model's database table name. By default this is the
class name converted to lowercase. Override this if you want to use
a different table name.
#### to_db_string()
Returns the instance as a bytestring ready to be inserted into the database.
#### to_dict(include_readonly=True, field_names=None)
Returns the instance's column values as a dict.
- `include_readonly`: if false, returns only fields that can be inserted into database.
- `field_names`: an iterable of field names to return (optional)
#### to_tskv(include_readonly=True)
Returns the instance's column keys and values as a tab-separated line. A newline is not included.
Fields that were not assigned a value are omitted.
- `include_readonly`: if false, returns only fields that can be inserted into database.
#### to_tsv(include_readonly=True)
Returns the instance's column values as a tab-separated line. A newline is not included.
- `include_readonly`: if false, returns only fields that can be inserted into database.
### DistributedModel
Extends Model
Model for Distributed engine
Model class for use with a `Distributed` engine.
#### DistributedModel(**kwargs)
@ -441,6 +576,9 @@ Unrecognized field names will cause an `AttributeError`.
#### DistributedModel.create_table_sql(db)
Returns the SQL statement for creating a table for this model.
#### DistributedModel.drop_table_sql(db)
@ -496,7 +634,7 @@ The `field_names` list must match the fields defined in the model, but does not
- `line`: the TSV-formatted data.
- `field_names`: names of the model fields in the data.
- `timezone_in_use`: the timezone to use when parsing dates and datetimes.
- `timezone_in_use`: the timezone to use when parsing dates and datetimes. Some fields use their own timezones.
- `database`: if given, sets the database that this instance belongs to.
@ -541,6 +679,10 @@ Returns a `QuerySet` for selecting instances of this model class.
#### set_database(db)
Sets the `Database` that this model instance belongs to.
This is done automatically when the instance is read from the database or written to it.
#### DistributedModel.table_name()
@ -581,6 +723,94 @@ Returns the instance's column values as a tab-separated line. A newline is not i
- `include_readonly`: if false, returns only fields that can be inserted into database.
### Constraint
Defines a model constraint.
#### Constraint(expr)
Initializer. Expects an expression that ClickHouse will verify when inserting data.
#### create_table_sql()
Returns the SQL statement for defining this constraint during table creation.
### Index
Defines a data-skipping index.
#### Index(expr, type, granularity)
Initializer.
- `expr` - a column, expression, or tuple of columns and expressions to index.
- `type` - the index type. Use one of the following methods to specify the type:
`Index.minmax`, `Index.set`, `Index.ngrambf_v1`, `Index.tokenbf_v1` or `Index.bloom_filter`.
- `granularity` - index block size (number of multiples of the `index_granularity` defined by the engine).
#### bloom_filter()
An index that stores a Bloom filter containing values of the index expression.
- `false_positive` - the probability (between 0 and 1) of receiving a false positive
response from the filter
#### create_table_sql()
Returns the SQL statement for defining this index during table creation.
#### minmax()
An index that stores extremes of the specified expression (if the expression is tuple, then it stores
extremes for each element of tuple). The stored info is used for skipping blocks of data like the primary key.
#### ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)
An index that stores a Bloom filter containing all ngrams from a block of data.
Works only with strings. Can be used for optimization of equals, like and in expressions.
- `n` — ngram size
- `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here,
for example 256 or 512, because it can be compressed well).
- `number_of_hash_functions` — The number of hash functions used in the Bloom filter.
- `random_seed` — The seed for Bloom filter hash functions.
#### set()
An index that stores unique values of the specified expression (no more than max_rows rows,
or unlimited if max_rows=0). Uses the values to check if the WHERE expression is not satisfiable
on a block of data.
#### tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)
An index that stores a Bloom filter containing string tokens. Tokens are sequences
separated by non-alphanumeric characters.
- `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here,
for example 256 or 512, because it can be compressed well).
- `number_of_hash_functions` — The number of hash functions used in the Bloom filter.
- `random_seed` — The seed for Bloom filter hash functions.
infi.clickhouse_orm.fields
--------------------------
@ -628,11 +858,18 @@ Extends Field
#### DateField(default=None, alias=None, materialized=None, readonly=None, codec=None)
### DateTime64Field
Extends DateTimeField
#### DateTime64Field(default=None, alias=None, materialized=None, readonly=None, codec=None, timezone=None, precision=6)
### DateTimeField
Extends Field
#### DateTimeField(default=None, alias=None, materialized=None, readonly=None, codec=None)
#### DateTimeField(default=None, alias=None, materialized=None, readonly=None, codec=None, timezone=None)
### Decimal128Field
@ -955,6 +1192,13 @@ Returns the contents of the query's `WHERE` or `PREWHERE` clause as a string.
Returns the number of matching model instances.
#### delete()
Deletes all records matched by this queryset's conditions.
Note that ClickHouse performs deletions in the background, so they are not immediate.
#### distinct()
@ -980,7 +1224,7 @@ Pass `prewhere=True` to apply the conditions as PREWHERE instead of WHERE.
Adds a FINAL modifier to table, meaning data will be collapsed to final version.
Can be used with `CollapsingMergeTree` engine only.
Can be used with the `CollapsingMergeTree` and `ReplacingMergeTree` engines only.
#### limit_by(offset_limit, *fields_or_expr)
@ -1031,6 +1275,14 @@ The result is a namedtuple containing `objects` (list), `number_of_objects`,
Returns the selected fields or expressions as a SQL string.
#### update(**kwargs)
Updates all records matched by this queryset's conditions.
Keyword arguments specify the field names and expressions to use for the update.
Note that ClickHouse performs updates in the background, so they are not immediate.
### AggregateQuerySet
Extends QuerySet
@ -1078,6 +1330,13 @@ Returns the contents of the query's `WHERE` or `PREWHERE` clause as a string.
Returns the number of rows after aggregation.
#### delete()
Deletes all records matched by this queryset's conditions.
Note that ClickHouse performs deletions in the background, so they are not immediate.
#### distinct()
@ -1103,7 +1362,7 @@ Pass `prewhere=True` to apply the conditions as PREWHERE instead of WHERE.
Adds a FINAL modifier to table, meaning data will be collapsed to final version.
Can be used with `CollapsingMergeTree` engine only.
Can be used with the `CollapsingMergeTree` and `ReplacingMergeTree` engines only.
#### group_by(*args)
@ -1160,6 +1419,14 @@ The result is a namedtuple containing `objects` (list), `number_of_objects`,
Returns the selected fields or expressions as a SQL string.
#### update(**kwargs)
Updates all records matched by this queryset's conditions.
Keyword arguments specify the field names and expressions to use for the update.
Note that ClickHouse performs updates in the background, so they are not immediate.
#### with_totals()
@ -1646,6 +1913,21 @@ Initializer.
#### covarSampOrNullIf(y, cond)
#### dictGet(attr_name, id_expr)
#### dictGetHierarchy(id_expr)
#### dictGetOrDefault(attr_name, id_expr, default)
#### dictHas(id_expr)
#### dictIsIn(child_id_expr, ancestor_id_expr)
#### divide(**kwargs)
@ -2507,6 +2789,15 @@ Initializer.
#### toDateTime(**kwargs)
#### toDateTime64(**kwargs)
#### toDateTime64OrNull(precision, timezone=NO_VALUE)
#### toDateTime64OrZero(precision, timezone=NO_VALUE)
#### toDateTimeOrNull()

View File

@ -10,7 +10,8 @@ The following field types are supported:
| StringField | String | str | Encoded as UTF-8 when written to ClickHouse
| FixedStringField | FixedString| str | Encoded as UTF-8 when written to ClickHouse
| DateField | Date | datetime.date | Range 1970-01-01 to 2105-12-31
| DateTimeField | DateTime | datetime.datetime | Minimal value is 1970-01-01 00:00:00; Always in UTC
| DateTimeField | DateTime | datetime.datetime | Minimal value is 1970-01-01 00:00:00; Timezone aware
| DateTime64Field | DateTime64 | datetime.datetime | Minimal value is 1970-01-01 00:00:00; Timezone aware
| Int8Field | Int8 | int | Range -128 to 127
| Int16Field | Int16 | int | Range -32768 to 32767
| Int32Field | Int32 | int | Range -2147483648 to 2147483647
@ -37,16 +38,22 @@ The following field types are supported:
DateTimeField and Time Zones
----------------------------
A `DateTimeField` can be assigned values from one of the following types:
`DateTimeField` and `DateTime64Field` can accept a `timezone` parameter (either the timezone name or a `pytz` timezone instance). This timezone will be used as the column timezone in ClickHouse. If not provided, the fields will use the timezone defined in the database configuration.
Both `DateTimeField` and `DateTime64Field` can be assigned values of any of the following types:
- datetime
- date
- integer - number of seconds since the Unix epoch
- float (DateTime64Field only) - number of seconds and microseconds since the Unix epoch
- string in `YYYY-MM-DD HH:MM:SS` format or [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601)-compatible format
The assigned value always gets converted to a timezone-aware `datetime` in UTC. If the assigned value is a timezone-aware `datetime` in another timezone, it will be converted to UTC. Otherwise, the assigned value is assumed to already be in UTC.
The assigned value always gets converted to a timezone-aware `datetime` in UTC. The only exception is when the assigned value is a timezone-aware `datetime`, in which case it will not be changed.
DateTime values that are read from the database are kept in the database-defined timezone - either the one defined for the field, or the global timezone defined in the database configuration.
It is strongly recommended to set the server timezone to UTC and to store all datetime values in that timezone, in order to prevent confusion and subtle bugs. Conversion to a different timezone should only be performed when the value needs to be displayed.
DateTime values that are read from the database are also converted to UTC. ClickHouse formats them according to the timezone of the server, and the ORM makes the necessary conversions. This requires a ClickHouse version which is new enough to support the `timezone()` function, otherwise it is assumed to be using UTC. In any case, we recommend settings the server timezone to UTC in order to prevent confusion.
Working with enum fields
------------------------

View File

@ -9,7 +9,7 @@ Defining Models
---------------
Models are defined in a way reminiscent of Django's ORM, by subclassing `Model`:
```python
from infi.clickhouse_orm import Model, StringField, DateField, Float32Field, MergeTree
class Person(Model):
@ -20,6 +20,7 @@ Models are defined in a way reminiscent of Django's ORM, by subclassing `Model`:
height = Float32Field()
engine = MergeTree('birthday', ('first_name', 'last_name', 'birthday'))
```
The columns in the database table are represented by model fields. Each field has a type, which matches the type of the corresponding database column. All the supported fields types are listed [here](field_types.md).
@ -66,7 +67,7 @@ For additional details see [here](field_options.md).
### Table Names
The table name used for the model is its class name, converted to lowercase. To override the default name, implement the `table_name` method:
```python
class Person(Model):
...
@ -74,6 +75,38 @@ The table name used for the model is its class name, converted to lowercase. To
@classmethod
def table_name(cls):
return 'people'
```
### Model Constraints
It is possible to define constraints which ClickHouse verifies when data is inserted. Trying to insert invalid records will raise a `ServerError`. Each constraint has a name and an expression to validate. For example:
```python
class Person(Model):
...
# Ensure that the birthday is not a future date
birthday_is_in_the_past = Constraint(birthday <= F.today())
```
### Data Skipping Indexes
Models that use an engine from the `MergeTree` family can define additional indexes over one or more columns or expressions. These indexes are used in SELECT queries for reducing the amount of data to read from the disk by skipping big blocks of data that do not satisfy the query's conditions.
For example:
```python
class Person(Model):
...
# A minmax index that can help find people taller or shorter than some height
height_index = Index(height, type=Index.minmax(), granularity=2)
# A trigram index that can help find substrings inside people names
names_index = Index((F.lower(first_name), F.lower(last_name)),
type=Index.ngrambf_v1(3, 256, 2, 0), granularity=1)
```
Using Models
------------

View File

@ -151,7 +151,7 @@ Adds a DISTINCT clause to the query, meaning that any duplicate rows in the resu
94
Final
--------
-----
This method can be used only with the `CollapsingMergeTree` and `ReplacingMergeTree` engines.
Adds a FINAL modifier to the query, meaning that the selected data is fully "collapsed" by the engine's sign field.
@ -203,6 +203,23 @@ The `paginate` method returns a `namedtuple` containing the following fields:
Note that you should use `QuerySet.order_by` so that the ordering is unique, otherwise there might be inconsistencies in the pagination (such as an instance that appears on two different pages).
Mutations
---------
To delete all records that match a queryset's conditions use the `delete` method:
Person.objects_in(database).filter(first_name='Max').delete()
To update records that match a queryset's conditions call the `update` method and provide the field names to update and the expressions to use (as keyword arguments):
Person.objects_in(database).filter(first_name='Max').update(first_name='Maximilian')
Note a few caveats:
- ClickHouse cannot update columns that are used in the calculation of the primary or the partition key.
- Mutations happen in the background, so they are not immediate.
- Only tables in the `MergeTree` family support mutations.
Aggregation
-----------

View File

@ -33,19 +33,19 @@ Each migration file is expected to contain a list of `operations`, for example:
The following operations are supported:
**CreateTable**
### CreateTable
A migration operation that creates a table for a given model class. If the table already exists, the operation does nothing.
In case the model class is a `BufferModel`, the operation first creates the underlying on-disk table, and then creates the buffer table.
**DropTable**
### DropTable
A migration operation that drops the table of a given model class. If the table does not exist, the operation does nothing.
**AlterTable**
### AlterTable
A migration operation that compares the table of a given model class to the models fields, and alters the table to match the model. The operation can:
@ -56,14 +56,19 @@ A migration operation that compares the table of a given model class to the mode
Default values are not altered by this operation.
**AlterTableWithBuffer**
### AlterTableWithBuffer
A compound migration operation for altering a buffer table and its underlying on-disk table. The buffer table is dropped, the on-disk table is altered, and then the buffer table is re-created. This is the procedure recommended in the ClickHouse documentation for handling scenarios in which the underlying table needs to be modified.
Applying this migration operation to a regular table has the same effect as an `AlterTable` operation.
**RunPython**
### AlterConstraints
A migration operation that adds new constraints from the model to the database table, and drops obsolete ones. Constraints are identified by their names, so a change in an existing constraint will not be detected unless its name was changed too. ClickHouse does not check that the constraints hold for existing data in the table.
### RunPython
A migration operation that runs a Python function. The function receives the `Database` instance to operate on.
@ -77,9 +82,9 @@ A migration operation that runs a Python function. The function receives the `Da
]
**RunSQL**
### RunSQL
A migration operation that runs raw SQL queries. It expects a string containing an SQL query, or an array of SQL-query strings.
A migration operation that runs raw SQL statements. It expects a string containing an SQL statement, or a list of such strings.
Example:

View File

@ -10,6 +10,8 @@
* [Materialized fields](models_and_databases.md#materialized-fields)
* [Alias fields](models_and_databases.md#alias-fields)
* [Table Names](models_and_databases.md#table-names)
* [Model Constraints](models_and_databases.md#model-constraints)
* [Data Skipping Indexes](models_and_databases.md#data-skipping-indexes)
* [Using Models](models_and_databases.md#using-models)
* [Inserting to the Database](models_and_databases.md#inserting-to-the-database)
* [Reading from the Database](models_and_databases.md#reading-from-the-database)
@ -30,6 +32,7 @@
* [Final](querysets.md#final)
* [Slicing](querysets.md#slicing)
* [Pagination](querysets.md#pagination)
* [Mutations](querysets.md#mutations)
* [Aggregation](querysets.md#aggregation)
* [Adding totals](querysets.md#adding-totals)
@ -58,6 +61,13 @@
* [Schema Migrations](schema_migrations.md#schema-migrations)
* [Writing Migrations](schema_migrations.md#writing-migrations)
* [CreateTable](schema_migrations.md#createtable)
* [DropTable](schema_migrations.md#droptable)
* [AlterTable](schema_migrations.md#altertable)
* [AlterTableWithBuffer](schema_migrations.md#altertablewithbuffer)
* [AlterConstraints](schema_migrations.md#alterconstraints)
* [RunPython](schema_migrations.md#runpython)
* [RunSQL](schema_migrations.md#runsql)
* [Running Migrations](schema_migrations.md#running-migrations)
* [System Models](system_models.md#system-models)
@ -74,13 +84,17 @@
* [infi.clickhouse_orm.models](class_reference.md#inficlickhouse_ormmodels)
* [Model](class_reference.md#model)
* [BufferModel](class_reference.md#buffermodel)
* [MergeModel](class_reference.md#mergemodel)
* [DistributedModel](class_reference.md#distributedmodel)
* [Constraint](class_reference.md#constraint)
* [Index](class_reference.md#index)
* [infi.clickhouse_orm.fields](class_reference.md#inficlickhouse_ormfields)
* [ArrayField](class_reference.md#arrayfield)
* [BaseEnumField](class_reference.md#baseenumfield)
* [BaseFloatField](class_reference.md#basefloatfield)
* [BaseIntField](class_reference.md#baseintfield)
* [DateField](class_reference.md#datefield)
* [DateTime64Field](class_reference.md#datetime64field)
* [DateTimeField](class_reference.md#datetimefield)
* [Decimal128Field](class_reference.md#decimal128field)
* [Decimal32Field](class_reference.md#decimal32field)

1
examples/cpu_usage/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/env/

View File

@ -0,0 +1,22 @@
# CPU Usage
This basic example uses `psutil` to collect a simple time-series of per-CPU usage percent. It then prints out some aggregate statistics based on the collected data.
## Running the code
Create a virtualenv and install the required libraries:
```
virtualenv -p python3.6 env
source env/bin/activate
pip install -r requirements.txt
```
Run the `collect` script to populate the database with the CPU statistics. Let it run for a bit before pressing CTRL+C.
```
python collect.py
```
Run the `results` script to display the CPU statistics:
```
python results.py
```

View File

@ -0,0 +1,20 @@
import psutil, time, datetime
from infi.clickhouse_orm import Database
from models import CPUStats
# Connect to the `demo` database and make sure the CPUStats table exists
db = Database('demo')
db.create_table(CPUStats)

psutil.cpu_percent(percpu=True) # first sample should be discarded

# Sample the per-CPU usage once a second until interrupted (e.g. with CTRL+C)
while True:
    time.sleep(1)
    stats = psutil.cpu_percent(percpu=True)
    # Use a timezone-aware UTC timestamp. A naive local `datetime.now()` would be
    # interpreted by the field as if it were already UTC, shifting every sample
    # by the machine's UTC offset.
    timestamp = datetime.datetime.now(datetime.timezone.utc)
    print(timestamp)
    # One row per CPU, all sharing the same sampling timestamp
    db.insert([
        CPUStats(timestamp=timestamp, cpu_id=cpu_id, cpu_percent=cpu_percent)
        for cpu_id, cpu_percent in enumerate(stats)
    ])

View File

@ -0,0 +1,11 @@
from infi.clickhouse_orm import Model, DateTimeField, UInt16Field, Float32Field, Memory
class CPUStats(Model):
    '''
    A single CPU usage sample.

    The `collect` script of this example inserts one instance per CPU
    for every sampling timestamp.
    '''

    timestamp = DateTimeField()  # when the sample was taken
    cpu_id = UInt16Field()  # index of the CPU within the sampled list
    cpu_percent = Float32Field()  # usage percent reported for that CPU

    engine = Memory()  # table engine used when the table is created

View File

@ -0,0 +1,2 @@
infi.clickhouse_orm
psutil

View File

@ -0,0 +1,13 @@
from infi.clickhouse_orm import Database, F
from models import CPUStats
db = Database('demo')
queryset = CPUStats.objects_in(db)

# Share of the CPU 1 samples in which it was more than 95% busy
total = queryset.filter(CPUStats.cpu_id == 1).count()
busy = queryset.filter(CPUStats.cpu_id == 1, CPUStats.cpu_percent > 95).count()
if total:
    print('CPU 1 was busy {:.2f}% of the time'.format(busy * 100.0 / total))
else:
    # Nothing collected yet, or the machine has a single CPU (only cpu_id 0):
    # avoid dividing by zero
    print('No samples found for CPU 1')

# Calculate the average usage per CPU
for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percent)):
    print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row))

1
examples/db_explorer/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/env/

View File

@ -0,0 +1,36 @@
# DB Explorer
This is a simple Flask web application that connects to ClickHouse and displays the list of existing databases. Clicking on a database name drills down into it, showing its list of tables. Clicking on a table drills down further, showing details about the table and its columns.
For each table or column, the application displays the compressed size on disk, the uncompressed size, and the ratio between them. Additionally, several pie charts are shown - top tables by size, top tables by rows, and top columns by size (in a table).
The pie charts are generated using the `pygal` charting library.
ORM concepts that are demonstrated by this example:
- Creating ORM models from existing tables using `Database.get_model_for_table`
- Queryset filtering
- Queryset aggregation
## Running the code
Create a virtualenv and install the required libraries:
```
virtualenv -p python3.6 env
source env/bin/activate
pip install -r requirements.txt
```
Run the server and open http://127.0.0.1:5000/ in your browser:
```
python server.py
```
By default the server connects to ClickHouse running on http://localhost:8123/ without a username or password, but you can change this using command line arguments:
```
python server.py http://myclickhouse:8123/
```
or:
```
python server.py http://myclickhouse:8123/ admin secret123
```

View File

@ -0,0 +1,62 @@
import pygal
from pygal.style import RotateStyle
from jinja2.filters import do_filesizeformat
# Formatting functions

def number_formatter(v):
    '''
    Formats a number using commas as thousands separators,
    for example 1234567 becomes '1,234,567'.
    '''
    return format(v, ',')


def bytes_formatter(v):
    '''
    Formats a size in bytes using Jinja2's `do_filesizeformat`,
    passing True as its second argument.
    NOTE(review): presumably this selects binary (1024-based) prefixes - confirm against the Jinja2 docs.
    '''
    return do_filesizeformat(v, True)
def tables_piechart(db, by_field, value_formatter):
    '''
    Builds a pie chart of the biggest tables in the database.
    `db` - the database instance
    `by_field` - name of the table attribute whose value sizes each slice
    `value_formatter` - a function to use for formatting the numeric values
    '''
    tables_model = db.get_model_for_table('tables', system_table=True)
    # Only permanent tables of this database, leaving out Buffer tables
    candidates = tables_model.objects_in(db)
    candidates = candidates.filter(database=db.db_name, is_temporary=False)
    candidates = candidates.exclude(engine='Buffer')
    slices = []
    for tbl in candidates:
        slices.append((getattr(tbl, by_field), tbl.name))
    return _generate_piechart(slices, value_formatter)
def columns_piechart(db, tbl_name, by_field, value_formatter):
    '''
    Builds a pie chart of the biggest columns in a table.
    `db` - the database instance
    `tbl_name` - the table name
    `by_field` - name of the column attribute whose value sizes each slice
    `value_formatter` - a function to use for formatting the numeric values
    '''
    columns_model = db.get_model_for_table('columns', system_table=True)
    table_columns = columns_model.objects_in(db).filter(database=db.db_name, table=tbl_name)
    slices = []
    for column in table_columns:
        slices.append((getattr(column, by_field), column.name))
    return _generate_piechart(slices, value_formatter)
def _get_top_tuples(tuples, n=15):
'''
Given a list of tuples (value, name), this function sorts
the list and returns only the top n results. All other tuples
are aggregated to a single "others" tuple.
'''
non_zero_tuples = [t for t in tuples if t[0]]
sorted_tuples = sorted(non_zero_tuples, reverse=True)
if len(sorted_tuples) > n:
others = (sum(t[0] for t in sorted_tuples[n:]), 'others')
sorted_tuples = sorted_tuples[:n] + [others]
return sorted_tuples
def _generate_piechart(tuples, value_formatter):
    '''
    Renders a pie chart and returns the output of pygal's `render`
    (requested as unicode, without an XML declaration).
    `tuples` - a list of (value, name) tuples to include in the chart
    `value_formatter` - a function to use for formatting the values
    '''
    font = 'Roboto'
    style = RotateStyle(
        '#9e6ffe',
        background='white',
        legend_font_family=font,
        legend_font_size=18,
        tooltip_font_family=font,
        tooltip_font_size=24,
    )
    chart = pygal.Pie(
        style=style,
        margin=0,
        title=' ',
        value_formatter=value_formatter,
        truncate_legend=-1,
    )
    # One slice per tuple that survives the top-n cut (plus "others", if any)
    for entry in _get_top_tuples(tuples):
        label, value = entry[1], entry[0]
        chart.add(label, value)
    return chart.render(is_unicode=True, disable_xml_declaration=True)

View File

@ -0,0 +1,15 @@
certifi==2020.4.5.2
chardet==3.0.4
click==7.1.2
Flask==1.1.2
idna==2.9
infi.clickhouse-orm==2.0.1
iso8601==0.1.12
itsdangerous==1.1.0
Jinja2==2.11.2
MarkupSafe==1.1.1
pygal==2.4.0
pytz==2020.1
requests==2.23.0
urllib3==1.25.9
Werkzeug==1.0.1

View File

@ -0,0 +1,87 @@
from infi.clickhouse_orm import Database, F
from charts import tables_piechart, columns_piechart, number_formatter, bytes_formatter
from flask import Flask
from flask import render_template
import sys
app = Flask(__name__)


@app.route('/')
def homepage_view():
    '''
    Root view: renders the list of databases, excluding the `system` database.
    '''
    db = _get_db('system')
    # The databases are listed in the system.databases table
    databases_model = db.get_model_for_table('databases', system_table=True)
    listing = (
        databases_model.objects_in(db)
        .exclude(name='system')
        .order_by(F.lower(databases_model.name))
    )
    # Generate the page
    return render_template('homepage.html', db=db, databases=listing)
@app.route('/<db_name>/')
def database_view(db_name):
    '''
    A view that displays information about a single database:
    its tables with their sizes, plus pie charts of the top tables
    by number of rows and by size.
    '''
    db = _get_db(db_name)
    # Table sizes are computed by aggregating the per-column sizes found in system.columns
    columns_model = db.get_model_for_table('columns', system_table=True)
    db_columns = columns_model.objects_in(db).filter(database=db_name)
    per_table = db_columns.aggregate(
        columns_model.table,
        compressed_size=F.sum(columns_model.data_compressed_bytes),
        uncompressed_size=F.sum(columns_model.data_uncompressed_bytes),
        ratio=F.sum(columns_model.data_uncompressed_bytes) / F.sum(columns_model.data_compressed_bytes)
    )
    per_table = per_table.order_by(F.lower(columns_model.table))
    # Generate the charts and then the page
    rows_chart = tables_piechart(db, 'total_rows', value_formatter=number_formatter)
    size_chart = tables_piechart(db, 'total_bytes', value_formatter=bytes_formatter)
    return render_template(
        'database.html',
        db=db,
        tables=per_table,
        tables_piechart_by_rows=rows_chart,
        tables_piechart_by_size=size_chart,
    )
@app.route('/<db_name>/<tbl_name>/')
def table_view(db_name, tbl_name):
    '''
    A view that displays information about a single table.

    - `db_name`: name of the database, taken from the URL.
    - `tbl_name`: name of the table, taken from the URL.
    '''
    db = _get_db(db_name)
    # Get table information from system.tables
    TablesTable = db.get_model_for_table('tables', system_table=True)
    tbl_info = TablesTable.objects_in(db).filter(database=db_name, name=tbl_name)[0]
    # Get the SQL used for creating the table. The name comes straight from
    # the URL, so it is backtick-quoted (with backslashes and backticks
    # escaped) instead of being pasted into the statement as-is. This also
    # makes names that are not plain identifiers work.
    quoted_name = '`%s`' % tbl_name.replace('\\', '\\\\').replace('`', '\\`')
    create_table_sql = db.raw('SHOW CREATE TABLE %s FORMAT TabSeparatedRaw' % quoted_name)
    # Get all columns in the table from system.columns
    ColumnsTable = db.get_model_for_table('columns', system_table=True)
    columns = ColumnsTable.objects_in(db).filter(database=db_name, table=tbl_name)
    # Generate the page
    return render_template('table.html',
        db=db,
        tbl_name=tbl_name,
        tbl_info=tbl_info,
        create_table_sql=create_table_sql,
        columns=columns,
        piechart=columns_piechart(db, tbl_name, 'data_compressed_bytes', value_formatter=bytes_formatter),
    )
def _get_db(db_name):
    '''
    Returns a read-only Database instance for `db_name`. The server URL,
    username and password are read from the first three command line
    arguments; a missing URL falls back to http://localhost:8123/ and
    missing credentials to None.
    '''
    fallbacks = ['http://localhost:8123/', None, None]
    given = sys.argv[1:4]
    # Whatever was not given on the command line is filled in from the fallbacks
    db_url, username, password = given + fallbacks[len(given):]
    return Database(db_name, db_url, username, password, readonly=True)
if __name__ == '__main__':
    # Script entry point: check the connection once, then start the web app
    _get_db('system') # fail early on db connection problems
    # NOTE(review): debug=True is presumably meant for local use of this
    # example only - confirm before exposing the server to other machines
    app.run(debug=True)

View File

@ -0,0 +1,22 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>ClickHouse Explorer</title>
<link rel="icon" href="data:,">
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300italic,700,700italic">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/normalize/8.0.1/normalize.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/milligram/1.4.0/milligram.css">
<script type="text/javascript" src="http://kozea.github.com/pygal.js/latest/pygal-tooltips.min.js"></script>
</head>
<body>
<div class="container">
{% block contents %}
{% endblock %}
</div>
</body>
</html>

View File

@ -0,0 +1,54 @@
{% extends "base.html" %}
{% block contents %}
<h1>{{ db.db_name }}</h1>
<p>
<a href="..">Home</a>
&raquo;
{{ db.db_name }}
</p>
<div class="row">
<div class="column">
<h2>Top Tables by Size</h2>
{% autoescape false %}
{{ tables_piechart_by_size }}
{% endautoescape %}
</div>
<div class="column">
<h2>Top Tables by Rows</h2>
{% autoescape false %}
{{ tables_piechart_by_rows }}
{% endautoescape %}
</div>
</div>
<h2>Tables ({{ tables.count() }})</h2>
<table>
<thead>
<tr>
<th>Name</th>
<th>Uncompressed Size</th>
<th>Compressed Size</th>
<th>Compression Ratio</th>
</tr>
</thead>
<tbody>
{% for table in tables %}
<tr>
<td><a href="{{ table.table|urlencode }}/">{{ table.table }}</a></td>
<td>{{ table.uncompressed_size|filesizeformat(true) }}</td>
<td>{{ table.compressed_size|filesizeformat(true) }}</td>
<td>{% if table.uncompressed_size %} {{ "%.2f" % table.ratio }} {% else %} 1 {% endif %} : 1</td>
</tr>
{% endfor %}
</tbody>
</table>
{% endblock %}

View File

@ -0,0 +1,41 @@
{% extends "base.html" %}
{% block contents %}
<div class="row">
<div class="column-50">
<h1>ClickHouse Explorer</h1>
<table>
<tr>
<th>URL</th>
<td>{{ db.db_url }}</td>
</tr>
<tr>
<th>Version</th>
<td>{{ db.server_version|join('.') }}</td>
</tr>
<tr>
<th>Timezone</th>
<td>{{ db.server_timezone }}</td>
</tr>
</table>
<h2>Databases ({{ databases.count() }})</h2>
<ul>
{% for d in databases %}
<li>
<a href="{{ d.name|urlencode }}/">{{ d.name }}</a>
</li>
{% endfor %}
</ul>
</div>
</div>
{% endblock %}

View File

@ -0,0 +1,79 @@
{% extends "base.html" %}
{% block contents %}
<p>
<a href="../..">Home</a>
&raquo;
<a href="..">{{ db.db_name }}</a>
&raquo;
{{ tbl_name }}
</p>
<h1>{{ tbl_name }}</h1>
<div class="row">
<div class="column">
<h2>Details</h2>
<table>
<tr>
<th>Total rows</th>
<td>{{ "{:,}".format(tbl_info.total_rows) }}</td>
</tr>
<tr>
<th>Total size</th>
<td>{{ tbl_info.total_bytes|filesizeformat(true) }}</td>
</tr>
{% if tbl_info.total_rows %}
<tr>
<th>Average row size</th>
<td>{{ (tbl_info.total_bytes / tbl_info.total_rows)|filesizeformat(true) }}</td>
</tr>
{% endif %}
<tr>
<th>Engine</th>
<td>{{ tbl_info.engine }}</td>
</tr>
</table>
</div>
<div class="column">
<h2>Top Columns by Size</h2>
{% autoescape false %}
{{ piechart }}
{% endautoescape %}
</div>
</div>
<h2>Columns ({{ columns.count() }})</h2>
<table>
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th>Uncompressed Size</th>
<th>Compressed Size</th>
<th>Compression Ratio</th>
</tr>
</thead>
<tbody>
{% for col in columns %}
<tr>
<td>{{ col.name }}</td>
<td>{{ col.type }}</td>
<td>{{ col.data_uncompressed_bytes|filesizeformat(true) }}</td>
<td>{{ col.data_compressed_bytes|filesizeformat(true) }}</td>
<td>{% if col.data_compressed_bytes %} {{ "%.2f" % (col.data_uncompressed_bytes / col.data_compressed_bytes) }} {% else %} 1 {% endif %} : 1</td>
</tr>
{% endfor %}
</tbody>
</table>
<h2>Table Definition</h2>
<pre><code>{{ create_table_sql }}</code></pre>
{% endblock %}

2
examples/full_text_search/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/ebooks/
/env/

View File

@ -0,0 +1,80 @@
# Full Text Search
This example shows how ClickHouse might be used for searching for word sequences in texts. It's a nice proof of concept, but for production use there are probably better solutions, such as Elasticsearch.
## Running the code
Create a virtualenv and install the required libraries:
```
virtualenv -p python3.6 env
source env/bin/activate
pip install -r requirements.txt
```
Run the `download_ebooks` script to download a dozen classical books from [Project Gutenberg](http://www.gutenberg.org/):
```
python download_ebooks.py
```
Run the `load` script to populate the database with the downloaded texts:
```
python load.py
```
And finally, run the full text search:
```
python search.py "cheshire cat"
```
Asterisks can be used as wildcards (each asterisk stands for one word):
```
python search.py "much * than"
```
## How it works
The `models.py` file defines an ORM model for storing each word in the indexed texts:
```python
class Fragment(Model):
language = LowCardinalityField(StringField(), default='EN')
document = LowCardinalityField(StringField())
idx = UInt64Field()
word = StringField()
stem = StringField()
# An index for faster search by document and fragment idx
index = Index((document, idx), type=Index.minmax(), granularity=1)
# The primary key allows efficient lookup of stems
engine = MergeTree(order_by=(stem, document, idx), partition_key=('language',))
```
The `document` (name) and `idx` (running number of the word inside the document) fields identify the specific word. The `word` field stores the original word as it appears in the text, while the `stem` contains the word after normalization, and that's the field which is used for matching the search terms. Stemming the words makes the matching less strict, so that searching for "swallowed" will also find documents that mention "swallow" or "swallowing".
Here's what some records in the fragment table might look like:
| language | document | idx | word | stem |
|----------|-------------------------|------|------------------|---------------|
| EN | Moby Dick; or The Whale | 4510 | whenever | whenev |
| EN | Moby Dick; or The Whale | 4511 | it | it |
| EN | Moby Dick; or The Whale | 4512 | is | is |
| EN | Moby Dick; or The Whale | 4513 | a | a |
| EN | Moby Dick; or The Whale | 4514 | damp, | damp |
| EN | Moby Dick; or The Whale | 4515 | drizzly | drizzli |
| EN | Moby Dick; or The Whale | 4516 | November | novemb |
| EN | Moby Dick; or The Whale | 4517 | in | in |
| EN | Moby Dick; or The Whale | 4518 | my | my |
| EN | Moby Dick; or The Whale | 4519 | soul; | soul |
Let's say we're looking for the terms "drizzly November". Finding the first in the sequence (after stemming it) is fast and easy:
```python
query = Fragment.objects_in(db).filter(stem='drizzli').only(Fragment.document, Fragment.idx)
```
We're interested only in the `document` and `idx` fields, since they identify a specific word.
To find the next word in the search terms, we need a subquery similar to the first one, with an additional condition that its index will be one greater than the index of the first word:
```python
subquery = Fragment.objects_in(db).filter(stem='novemb').only(Fragment.document, Fragment.idx)
query = query.filter(F.isIn((Fragment.document, Fragment.idx + 1), subquery))
```
And so on, by adding another subquery for each additional search term we can construct the whole sequence of words.
As for wildcard support, when encountering a wildcard in the search terms we simply skip it - it does not need a subquery (since it can match any word). It only increases the index count so that the query conditions will "skip" one word in the sequence.
The algorithm for building this compound query can be found in the `build_query` function.

View File

@ -0,0 +1,27 @@
import requests
import os
def download_ebook(id):
    '''
    Downloads the text of a Gutenberg ebook and saves it as
    `ebooks/<title>.txt`. Progress is printed to stdout.

    - `id`: the numeric id of the ebook on gutenberg.org.

    Ebooks for which the server answers 404 are skipped. Any other HTTP
    error status is raised by `raise_for_status`.
    '''
    print(id, end=' ')
    # Download the ebook's text
    r = requests.get('https://www.gutenberg.org/files/{id}/{id}-0.txt'.format(id=id))
    if r.status_code == 404:
        print('NOT FOUND, SKIPPING')
        return
    r.raise_for_status()
    # Find the ebook's title. Fall back to the id, so that a text without
    # a "Title:" line is still saved instead of failing with a NameError.
    text = r.content.decode('utf-8')
    title = str(id)
    for line in text.splitlines():
        if line.startswith('Title:'):
            title = line[6:].strip() or title
            # Stop at the first match, so that a later line in the book's
            # body cannot override the title from the header
            break
    print(title)
    # The title is used as a file name, so path separators must not survive
    filename = title.replace('/', '-').replace('\\', '-')
    # Save the ebook
    with open('ebooks/{}.txt'.format(filename), 'wb') as f:
        f.write(r.content)
if __name__ == "__main__":
    # Ids of the ebooks to fetch; each one is passed to download_ebook
    ebook_ids = (1342, 11, 84, 2701, 25525, 1661, 98, 74, 43, 215, 1400, 76)
    # The target directory must exist before the first ebook is saved
    os.makedirs('ebooks', exist_ok=True)
    for ebook_id in ebook_ids:
        download_ebook(ebook_id)

View File

@ -0,0 +1,61 @@
import sys
import nltk
from nltk.stem.porter import PorterStemmer
from glob import glob
from infi.clickhouse_orm import Database
from models import Fragment
def trim_punctuation(word):
    '''
    Strips non-alphanumeric characters from both ends of the word.
    Characters between the first and last alphanumeric ones are kept
    as they are. A word without alphanumeric characters becomes ''.
    '''
    alnum_positions = [pos for pos, char in enumerate(word) if char.isalnum()]
    if not alnum_positions:
        return word[len(word):]
    first, last = alnum_positions[0], alnum_positions[-1]
    return word[first:last + 1]
def parse_file(filename):
    '''
    Reads the UTF-8 text file at the given path word by word, where
    words are separated by whitespace.
    Yields (original_word, stemmed_word) tuples. The original_word is
    left untouched, so it may include punctuation characters; the stem
    is computed after trimming them.
    '''
    stemmer = PorterStemmer()
    with open(filename, 'r', encoding='utf-8') as f:
        words = (word for line in f for word in line.split())
        for word in words:
            trimmed = trim_punctuation(word)
            yield (word, stemmer.stem(trimmed))
def get_fragments(filename):
    '''
    Yields a Fragment instance for each word in the text file at the
    given path. The document name is the file name without its directory
    and extension, and words are numbered starting from 1.
    Prints the file's word count once all the words were consumed.
    '''
    from os import path
    base_name = path.basename(filename)
    document, _extension = path.splitext(base_name)
    word_count = 0
    for word_count, (word, stem) in enumerate(parse_file(filename), start=1):
        yield Fragment(document=document, idx=word_count, word=word, stem=stem)
    print('{} - {} words'.format(filename, word_count))
if __name__ == '__main__':
    # Script entry point: creates the Fragment table and fills it with the
    # words of the given text files
    # Load NLTK data if necessary
    nltk.download('punkt')
    nltk.download('wordnet')
    # Initialize database
    db = Database('default')
    db.create_table(Fragment)
    # Load files from the command line or everything under ebooks/
    filenames = sys.argv[1:] or glob('ebooks/*.txt')
    for filename in filenames:
        # get_fragments is a generator, so each file is passed to the
        # database as a stream of Fragment instances
        db.insert(get_fragments(filename), batch_size=100000)

View File

@ -0,0 +1,16 @@
from infi.clickhouse_orm import *
class Fragment(Model):
    '''
    A single word of an indexed text. The `document` and `idx` fields
    together identify the specific word.
    '''

    language = LowCardinalityField(StringField(), default='EN')
    # Name of the document that the word belongs to
    document = LowCardinalityField(StringField())
    # Running number of the word inside the document
    idx = UInt64Field()
    # The original word as it appears in the text
    word = StringField()
    # The word after normalization - this is what search terms are matched against
    stem = StringField()

    # An index for faster search by document and fragment idx
    index = Index((document, idx), type=Index.minmax(), granularity=1)

    # The primary key allows efficient lookup of stems
    engine = MergeTree(order_by=(stem, document, idx), partition_key=('language',))

View File

@ -0,0 +1,4 @@
infi.clickhouse_orm
nltk
requests
colorama

View File

@ -0,0 +1,90 @@
import sys
from colorama import init, Fore, Back, Style
from nltk.stem.porter import PorterStemmer
from infi.clickhouse_orm import Database, F
from models import Fragment
from load import trim_punctuation
# The wildcard character
WILDCARD = '*'
def prepare_search_terms(text):
    '''
    Splits the text to search on whitespace and returns a list with the
    stem of each word (after trimming its punctuation). Wildcards are
    kept in the list as they are.
    '''
    stemmer = PorterStemmer()
    return [
        WILDCARD if word == WILDCARD else stemmer.stem(trim_punctuation(word))
        for word in text.split()
    ]
def build_query(db, stems):
    '''
    Returns a queryset for finding the Fragment instances that start a
    sequence of words matching the list of stemmed words. The results
    contain only the document and idx, and are sorted by them.
    '''
    fragments = Fragment.objects_in(db)
    columns = (Fragment.document, Fragment.idx)
    # The first stemmed word anchors the search
    query = fragments.filter(stem=stems[0]).only(*columns)
    # Every following word must appear at its own distance from the anchor
    for distance, stem in enumerate(stems[1:], start=1):
        # A wildcard matches any word, so it adds no condition - it only
        # takes up one place in the sequence
        if stem == WILDCARD:
            continue
        subquery = fragments.filter(stem=stem).only(*columns)
        expected_position = (Fragment.document, Fragment.idx + distance)
        query = query.filter(F.isIn(expected_position, subquery))
    return query.order_by(Fragment.document, Fragment.idx)
def get_matching_text(db, document, from_idx, to_idx, extra=5):
    '''
    Rebuilds the text of the document between the given indexes
    (inclusive), with up to `extra` words of context on each side.
    The green color starts at the word whose idx is `from_idx`, and
    the style is reset after the word whose idx is `to_idx`.
    '''
    first_idx = from_idx - extra
    last_idx = to_idx + extra
    conds = (Fragment.document == document) & (Fragment.idx >= first_idx) & (Fragment.idx <= last_idx)
    fragments = Fragment.objects_in(db).filter(conds).order_by('document', 'idx')
    words = []
    for fragment in fragments:
        color_on = Fore.GREEN if fragment.idx == from_idx else ''
        color_off = Style.RESET_ALL if fragment.idx == to_idx else ''
        words.append(color_on + fragment.word + color_off)
    return ' '.join(words)
def find(db, text):
    '''
    Searches for the given text. Prints the generated query first,
    and then one line per match: the document name followed by the
    matching words in their context.
    '''
    stems = prepare_search_terms(text)
    query = build_query(db, stems)
    print('\n' + Fore.MAGENTA + str(query) + Style.RESET_ALL + '\n')
    # Distance between the first and the last word of a match
    span = len(stems) - 1
    for match in query:
        snippet = get_matching_text(db, match.document, match.idx, match.idx + span)
        print(Fore.CYAN + match.document + ':' + Style.RESET_ALL, snippet)
if __name__ == '__main__':
    # Script entry point: searches for the text given on the command line
    # Initialize colored output
    init()
    # Initialize database
    db = Database('default')
    # Search
    # All the command line arguments are joined into a single search text;
    # nothing is searched when the result is empty
    text = ' '.join(sys.argv[1:])
    if text:
        find(db, text)

View File

@ -132,7 +132,7 @@ if __name__ == '__main__':
print('===============')
print()
module_doc([database.Database, database.DatabaseException])
module_doc([models.Model, models.BufferModel, models.DistributedModel])
module_doc([models.Model, models.BufferModel, models.MergeModel, models.DistributedModel, models.Constraint, models.Index])
module_doc(sorted([fields.Field] + all_subclasses(fields.Field), key=lambda x: x.__name__), False)
module_doc([engines.Engine] + all_subclasses(engines.Engine), False)
module_doc([query.QuerySet, query.AggregateQuerySet, query.Q])

View File

@ -6,6 +6,7 @@ from calendar import timegm
from decimal import Decimal, localcontext
from uuid import UUID
from logging import getLogger
from pytz import BaseTzInfo
from .utils import escape, parse_array, comma_join, string_or_func, get_subclass_names
from .funcs import F, FunctionOperatorsMixin
from ipaddress import IPv4Address, IPv6Address
@ -86,10 +87,17 @@ class Field(FunctionOperatorsMixin):
- `db`: Database, used for checking supported features.
'''
sql = self.db_type
args = self.get_db_type_args()
if args:
sql += '(%s)' % comma_join(args)
if with_default_expression:
sql += self._extra_params(db)
return sql
def get_db_type_args(self):
    """
    Returns the arguments of the field's database type, as a list of strings.
    When the list is not empty, `get_sql` appends it (comma-joined, in
    parentheses) to the type name. This base implementation returns no arguments.
    """
    return []
def _extra_params(self, db):
sql = ''
if self.alias:
@ -187,9 +195,23 @@ class DateTimeField(Field):
class_default = datetime.datetime.fromtimestamp(0, pytz.utc)
db_type = 'DateTime'
def __init__(self, default=None, alias=None, materialized=None, readonly=None, codec=None,
             timezone=None):
    '''
    Initializer. In addition to the parameters that are passed on to the
    parent class, accepts an optional `timezone` - either a pytz timezone
    object, or a timezone name that is looked up using `pytz.timezone`.
    '''
    super().__init__(default, alias, materialized, readonly, codec)
    # Timezone names are converted to timezone objects; anything
    # falsy (no timezone) is stored unchanged
    if timezone and not isinstance(timezone, BaseTzInfo):
        timezone = pytz.timezone(timezone)
    self.timezone = timezone
def get_db_type_args(self):
    '''
    Returns the arguments of the database type: the escaped name of the
    field's timezone if it has one, otherwise nothing.
    '''
    if not self.timezone:
        return []
    return [escape(self.timezone.zone)]
def to_python(self, value, timezone_in_use):
if isinstance(value, datetime.datetime):
return value.astimezone(pytz.utc) if value.tzinfo else value.replace(tzinfo=pytz.utc)
return value if value.tzinfo else value.replace(tzinfo=pytz.utc)
if isinstance(value, datetime.date):
return datetime.datetime(value.year, value.month, value.day, tzinfo=pytz.utc)
if isinstance(value, int):
@ -212,13 +234,61 @@ class DateTimeField(Field):
# convert naive to aware
if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None:
dt = timezone_in_use.localize(dt)
return dt.astimezone(pytz.utc)
return dt
raise ValueError('Invalid value for %s - %r' % (self.__class__.__name__, value))
def to_db_string(self, value, quote=True):
    '''
    Returns the value as an integer timestamp string, zero-padded to at
    least 10 digits. The timestamp is computed from the value's UTC time
    tuple, and the string is passed through `escape` with the given `quote` flag.
    '''
    return escape('%010d' % timegm(value.utctimetuple()), quote)
class DateTime64Field(DateTimeField):
    '''
    A DateTimeField variant for the `DateTime64` database type, which
    takes a precision (number of sub-second digits) in addition to
    the optional timezone.
    '''

    db_type = 'DateTime64'

    def __init__(self, default=None, alias=None, materialized=None, readonly=None, codec=None,
                 timezone=None, precision=6):
        '''
        Initializer. All parameters except `precision` are passed on to DateTimeField.
        '''
        super().__init__(default, alias, materialized, readonly, codec, timezone)
        # NOTE(review): the assertion lets precision=None through, but
        # get_db_type_args would then emit 'None' and to_db_string computes
        # 11 + self.precision - looks like None is not really usable, confirm
        assert precision is None or isinstance(precision, int), 'Precision must be int type'
        self.precision = precision

    def get_db_type_args(self):
        '''
        Returns the arguments of the database type: the precision, followed
        by the escaped timezone name if the field has a timezone.
        '''
        args = [str(self.precision)]
        if self.timezone:
            args.append(escape(self.timezone.zone))
        return args

    def to_db_string(self, value, quote=True):
        """
        Returns the field's value prepared for writing to the database
        Returns string in 0000000000.000000 format, where remainder digits count is equal to precision
        """
        return escape(
            '{timestamp:0{width}.{precision}f}'.format(
                timestamp=value.timestamp(),
                # Minimum width: 11 characters plus the fractional digits
                width=11 + self.precision,
                precision=self.precision),
            quote
        )

    def to_python(self, value, timezone_in_use):
        '''
        Converts the value to a datetime. The conversion is first attempted
        by DateTimeField; only if that raises a ValueError, the additional
        formats below are tried. When they do not apply either, the
        original ValueError is re-raised.
        '''
        try:
            return super().to_python(value, timezone_in_use)
        except ValueError:
            # Numbers are interpreted as UTC timestamps
            if isinstance(value, (int, float)):
                return datetime.datetime.utcfromtimestamp(value).replace(tzinfo=pytz.utc)
            if isinstance(value, str):
                # Look only at the part before the fractional digits
                left_part = value.split('.')[0]
                # The all-zeros datetime is mapped to the class default
                if left_part == '0000-00-00 00:00:00':
                    return self.class_default
                # A 10-character left part may be a timestamp with a fraction
                if len(left_part) == 10:
                    try:
                        value = float(value)
                        return datetime.datetime.utcfromtimestamp(value).replace(tzinfo=pytz.utc)
                    except ValueError:
                        # Not a number after all - fall through to the re-raise
                        pass
            # Re-raise the ValueError from the parent's conversion
            raise
class BaseIntField(Field):
'''
Abstract base class for all integer-type fields.
@ -410,15 +480,8 @@ class BaseEnumField(Field):
def to_db_string(self, value, quote=True):
    '''
    Returns the name of the enum member, passed through `escape`
    with the given `quote` flag.
    '''
    return escape(value.name, quote)
def get_sql(self, with_default_expression=True, db=None):
    '''
    Returns the SQL for this enum field: the database type followed by
    its `name = value` pairs in parentheses.

    - `with_default_expression`: if true, a DEFAULT expression is appended,
      and also a CODEC expression when the field has a codec and `db`
      reports codec support.
    - `db`: Database, used for checking supported features.
    '''
    values = ['%s = %d' % (escape(item.name), item.value) for item in self.enum_cls]
    sql = '%s(%s)' % (self.db_type, ' ,'.join(values))
    if with_default_expression:
        default = self.to_db_string(self.default)
        sql = '%s DEFAULT %s' % (sql, default)
        if self.codec and db and db.has_codec_support:
            sql+= ' CODEC(%s)' % self.codec
    return sql
def get_db_type_args(self):
    '''
    Returns the arguments of the database type: one `name = value` string
    (with the name escaped) for each member of the enum class.
    '''
    return ['%s = %d' % (escape(item.name), item.value) for item in self.enum_cls]
@classmethod
def create_ad_hoc_field(cls, db_type):

View File

@ -1,9 +1,8 @@
from datetime import date, datetime, tzinfo, timedelta
from functools import wraps
from inspect import signature, Parameter
from types import FunctionType
from .utils import is_iterable, comma_join, NO_VALUE
from .utils import is_iterable, comma_join, NO_VALUE, arg_to_sql
from .query import Cond, QuerySet
@ -263,43 +262,9 @@ class F(Cond, FunctionOperatorsMixin, metaclass=FMeta):
else:
prefix = self.name
sep = ', '
arg_strs = (F._arg_to_sql(arg) for arg in self.args if arg != NO_VALUE)
arg_strs = (arg_to_sql(arg) for arg in self.args if arg != NO_VALUE)
return prefix + '(' + sep.join(arg_strs) + ')'
@staticmethod
def _arg_to_sql(arg):
    """
    Converts a function argument to SQL string according to its type.
    Supports functions, model fields, strings, dates, datetimes, timedeltas, booleans,
    None, numbers, timezones, arrays/iterables.
    The order of the type checks matters, since the first match wins.
    Any type that is not handled explicitly is converted using `str`.
    """
    # NOTE(review): imported locally, presumably to avoid a circular import - confirm
    from .fields import Field, StringField, DateTimeField, DateField
    if isinstance(arg, F):
        return arg.to_sql()
    if isinstance(arg, Field):
        return "`%s`" % arg
    if isinstance(arg, str):
        return StringField().to_db_string(arg)
    # datetime is a subclass of date, so it has to be checked first
    if isinstance(arg, datetime):
        return "toDateTime(%s)" % DateTimeField().to_db_string(arg)
    if isinstance(arg, date):
        return "toDate('%s')" % arg.isoformat()
    # Timedeltas are truncated to whole seconds
    if isinstance(arg, timedelta):
        return "toIntervalSecond(%d)" % int(arg.total_seconds())
    # Booleans are rendered as 1 or 0
    if isinstance(arg, bool):
        return str(int(arg))
    if isinstance(arg, tzinfo):
        return StringField().to_db_string(arg.tzname(None))
    if arg is None:
        return 'NULL'
    # Querysets become parenthesized subqueries
    if isinstance(arg, QuerySet):
        return "(%s)" % arg
    # Tuples are rendered in parentheses, other iterables in square brackets;
    # in both cases the items are converted recursively
    if isinstance(arg, tuple):
        return '(' + comma_join(F._arg_to_sql(x) for x in arg) + ')'
    if is_iterable(arg):
        return '[' + comma_join(F._arg_to_sql(x) for x in arg) + ']'
    return str(arg)
# Arithmetic functions
@staticmethod
@ -767,6 +732,11 @@ class F(Cond, FunctionOperatorsMixin, metaclass=FMeta):
def toDateTime(x):
return F('toDateTime', x)
@staticmethod
@type_conversion
def toDateTime64(x, precision, timezone=NO_VALUE):
    '''
    Builds a `toDateTime64` function expression from `x`, `precision` and
    the optional `timezone`. A timezone that is left as NO_VALUE is
    omitted when the expression is converted to SQL.
    '''
    return F('toDateTime64', x, precision, timezone)
@staticmethod
def toString(x):
    '''
    Builds a `toString` function expression with `x` as its argument.
    '''
    return F('toString', x)
@ -1819,6 +1789,28 @@ class F(Cond, FunctionOperatorsMixin, metaclass=FMeta):
def greatest(x, y):
return F('greatest', x, y)
# Dictionary functions

@staticmethod
def dictGet(dict_name, attr_name, id_expr):
    '''
    Builds a `dictGet` function expression with the given arguments.
    '''
    return F('dictGet', dict_name, attr_name, id_expr)

@staticmethod
def dictGetOrDefault(dict_name, attr_name, id_expr, default):
    '''
    Builds a `dictGetOrDefault` function expression with the given arguments.
    '''
    return F('dictGetOrDefault', dict_name, attr_name, id_expr, default)

@staticmethod
def dictHas(dict_name, id_expr):
    '''
    Builds a `dictHas` function expression with the given arguments.
    '''
    return F('dictHas', dict_name, id_expr)

@staticmethod
def dictGetHierarchy(dict_name, id_expr):
    '''
    Builds a `dictGetHierarchy` function expression with the given arguments.
    '''
    return F('dictGetHierarchy', dict_name, id_expr)

@staticmethod
def dictIsIn(dict_name, child_id_expr, ancestor_id_expr):
    '''
    Builds a `dictIsIn` function expression with the given arguments.
    '''
    return F('dictIsIn', dict_name, child_id_expr, ancestor_id_expr)
# Expose only relevant classes in import *
__all__ = ['F']

View File

@ -7,7 +7,7 @@ import logging
logger = logging.getLogger('migrations')
class Operation(object):
class Operation():
'''
Base class for migration operations.
'''
@ -16,22 +16,40 @@ class Operation(object):
raise NotImplementedError() # pragma: no cover
class CreateTable(Operation):
class ModelOperation(Operation):
    '''
    Common base for the migration operations that target a single model.
    '''

    def __init__(self, model_class):
        '''
        Initializer. Keeps the model class, and the name of its table.
        '''
        self.model_class = model_class
        self.table_name = model_class.table_name()

    def _alter_table(self, database, cmd):
        '''
        Runs the given command as an ALTER TABLE statement on the
        model's table, after logging the statement at debug level.
        '''
        statement = "ALTER TABLE $db.`%s` %s" % (self.table_name, cmd)
        logger.debug(statement)
        database.raw(statement)
class CreateTable(ModelOperation):
    '''
    A migration operation that creates a table for a given model class.
    The initializer is inherited from ModelOperation, which sets both
    `model_class` and the `table_name` that `apply` relies on.
    '''

    def apply(self, database):
        '''
        Creates the model's table in the database. For buffer models,
        the table of the engine's main model is created first.
        '''
        logger.info(' Create table %s', self.table_name)
        if issubclass(self.model_class, BufferModel):
            database.create_table(self.model_class.engine.main_model)
        database.create_table(self.model_class)
class AlterTable(Operation):
class AlterTable(ModelOperation):
'''
A migration operation that compares the table of a given model class to
the model's fields, and alters the table to match the model. The operation can:
@ -41,20 +59,12 @@ class AlterTable(Operation):
Default values are not altered by this operation.
'''
def __init__(self, model_class):
self.model_class = model_class
def _get_table_fields(self, database):
query = "DESC `%s`.`%s`" % (database.db_name, self.model_class.table_name())
query = "DESC `%s`.`%s`" % (database.db_name, self.table_name)
return [(row.name, row.type) for row in database.select(query)]
def _alter_table(self, database, cmd):
cmd = "ALTER TABLE `%s`.`%s` %s" % (database.db_name, self.model_class.table_name(), cmd)
logger.debug(cmd)
database._send(cmd)
def apply(self, database):
logger.info(' Alter table %s', self.model_class.table_name())
logger.info(' Alter table %s', self.table_name)
# Note that MATERIALIZED and ALIAS fields are always at the end of the DESC,
# ADD COLUMN ... AFTER doesn't affect it
@ -100,16 +110,13 @@ class AlterTable(Operation):
self._alter_table(database, 'MODIFY COLUMN %s %s' % (field_name, model_fields[field_name]))
class AlterTableWithBuffer(Operation):
class AlterTableWithBuffer(ModelOperation):
'''
A migration operation for altering a buffer table and its underlying on-disk table.
The buffer table is dropped, the on-disk table is altered, and then the buffer table
is re-created.
'''
def __init__(self, model_class):
self.model_class = model_class
def apply(self, database):
if issubclass(self.model_class, BufferModel):
DropTable(self.model_class).apply(database)
@ -119,25 +126,108 @@ class AlterTableWithBuffer(Operation):
AlterTable(self.model_class).apply(database)
class DropTable(Operation):
class DropTable(ModelOperation):
    '''
    A migration operation that drops the table of a given model class.
    The initializer is inherited from ModelOperation, which sets both
    `model_class` and the `table_name` that `apply` relies on.
    '''

    def apply(self, database):
        '''
        Drops the model's table from the database.
        '''
        logger.info(' Drop table %s', self.table_name)
        database.drop_table(self.model_class)
class AlterConstraints(ModelOperation):
    '''
    A migration operation that adds new constraints from the model to the database
    table, and drops obsolete ones. Constraints are identified by their names, so
    a change in an existing constraint will not be detected unless its name was changed too.
    ClickHouse does not check that the constraints hold for existing data in the table.
    '''

    def apply(self, database):
        '''
        Brings the table's constraints in line with the model. Only
        ALTER TABLE statements are issued - the table itself is kept.
        '''
        logger.info(' Alter constraints for %s', self.table_name)
        existing = self._get_constraint_names(database)
        # Go over constraints in the model
        for constraint in self.model_class._constraints.values():
            # Check if it's a new constraint
            if constraint.name not in existing:
                logger.info(' Add constraint %s', constraint.name)
                self._alter_table(database, 'ADD %s' % constraint.create_table_sql())
            else:
                existing.remove(constraint.name)
        # Remaining constraints in `existing` are obsolete
        for name in existing:
            logger.info(' Drop constraint %s', name)
            self._alter_table(database, 'DROP CONSTRAINT `%s`' % name)

    def _get_constraint_names(self, database):
        '''
        Returns a set containing the names of existing constraints in the table.
        The names are extracted from the output of SHOW CREATE TABLE.
        '''
        import re
        table_def = database.raw('SHOW CREATE TABLE $db.`%s`' % self.table_name)
        matches = re.findall(r'\sCONSTRAINT\s+`?(.+?)`?\s+CHECK\s', table_def)
        return set(matches)
class AlterIndexes(ModelOperation):
    '''
    A migration operation that adds new indexes from the model to the database
    table, and drops obsolete ones. Indexes are identified by their names, so
    a change in an existing index will not be detected unless its name was changed too.
    '''

    def __init__(self, model_class, reindex=False):
        '''
        Initializer.
        By default ClickHouse does not build indexes over existing data, only for
        new data. Passing `reindex=True` will run `OPTIMIZE TABLE` in order to build
        the indexes over the existing data.
        '''
        super().__init__(model_class)
        self.reindex = reindex

    def apply(self, database):
        '''
        Brings the table's indexes in line with the model, and then
        optionally rebuilds them over the existing data.
        '''
        logger.info(' Alter indexes for %s', self.table_name)
        existing = self._get_index_names(database)
        # Diagnostic detail only, so it is labeled and kept at debug level
        logger.debug(' Existing indexes: %s', existing)
        # Go over indexes in the model
        for index in self.model_class._indexes.values():
            # Check if it's a new index
            if index.name not in existing:
                logger.info(' Add index %s', index.name)
                self._alter_table(database, 'ADD %s' % index.create_table_sql())
            else:
                existing.remove(index.name)
        # Remaining indexes in `existing` are obsolete
        for name in existing:
            logger.info(' Drop index %s', name)
            self._alter_table(database, 'DROP INDEX `%s`' % name)
        # Reindex
        if self.reindex:
            logger.info(' Build indexes on table')
            database.raw('OPTIMIZE TABLE $db.`%s` FINAL' % self.table_name)

    def _get_index_names(self, database):
        '''
        Returns a set containing the names of existing indexes in the table.
        The names are extracted from the output of SHOW CREATE TABLE.
        '''
        import re
        table_def = database.raw('SHOW CREATE TABLE $db.`%s`' % self.table_name)
        matches = re.findall(r'\sINDEX\s+`?(.+?)`?\s+', table_def)
        return set(matches)
class RunPython(Operation):
'''
A migration operation that executes given python function on database
A migration operation that executes a Python function.
'''
def __init__(self, func):
assert callable(func), "'func' parameter must be function"
'''
Initializer. The given Python function will be called with a single
argument - the Database instance to apply the migration to.
'''
assert callable(func), "'func' argument must be function"
self._func = func
def apply(self, database):
@ -147,14 +237,17 @@ class RunPython(Operation):
class RunSQL(Operation):
'''
A migration operation that executes given SQL on database
A migration operation that executes arbitrary SQL statements.
'''
def __init__(self, sql):
'''
Initializer. The given sql argument must be a valid SQL statement or
list of statements.
'''
if isinstance(sql, str):
sql = [sql]
assert isinstance(sql, list), "'sql' parameter must be string or list of strings"
assert isinstance(sql, list), "'sql' argument must be string or list of strings"
self._sql = sql
def apply(self, database):

View File

@ -1,12 +1,13 @@
from __future__ import unicode_literals
import sys
from collections import OrderedDict
from itertools import chain
from logging import getLogger
import pytz
from .fields import Field, StringField
from .utils import parse_tsv, NO_VALUE, get_subclass_names
from .utils import parse_tsv, NO_VALUE, get_subclass_names, arg_to_sql, unescape
from .query import QuerySet
from .funcs import F
from .engines import Merge, Distributed
@ -14,6 +15,110 @@ from .engines import Merge, Distributed
logger = getLogger('clickhouse_orm')
class Constraint:
    '''
    A constraint declared on a model.
    '''

    # Both attributes are set by the parent model
    name = None
    parent = None

    def __init__(self, expr):
        '''
        Initializer.

        - `expr` - an expression that ClickHouse will verify when inserting data.
        '''
        self.expr = expr

    def create_table_sql(self):
        '''
        Returns the SQL clause that defines this constraint as part of
        a table creation statement.
        '''
        check_sql = arg_to_sql(self.expr)
        return 'CONSTRAINT `%s` CHECK %s' % (self.name, check_sql)
class Index:
    '''
    A data-skipping index declared on a model.
    '''

    # Both attributes are set by the parent model
    name = None
    parent = None

    def __init__(self, expr, type, granularity):
        '''
        Initializer.

        - `expr` - a column, expression, or tuple of columns and expressions to index.
        - `type` - the index type. Use one of the following methods to specify the type:
          `Index.minmax`, `Index.set`, `Index.ngrambf_v1`, `Index.tokenbf_v1` or `Index.bloom_filter`.
        - `granularity` - index block size (number of multiples of the `index_granularity` defined by the engine).
        '''
        self.expr, self.type, self.granularity = expr, type, granularity

    def create_table_sql(self):
        '''
        Returns the SQL clause that defines this index as part of
        a table creation statement.
        '''
        parts = (self.name, arg_to_sql(self.expr), self.type, self.granularity)
        return 'INDEX `%s` %s TYPE %s GRANULARITY %d' % parts

    @staticmethod
    def minmax():
        '''
        An index type that stores the extremes of the indexed expression (for a
        tuple - the extremes of each of its elements). The stored info is used
        for skipping blocks of data like the primary key.
        '''
        return 'minmax'

    @staticmethod
    def set(max_rows):
        '''
        An index type that stores the unique values of the indexed expression
        (no more than max_rows rows, or unlimited if max_rows=0). The values are
        used to check if the WHERE expression is not satisfiable on a block of data.
        '''
        return 'set(%d)' % max_rows

    @staticmethod
    def ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed):
        '''
        An index type that stores a Bloom filter containing all ngrams from a block of data.
        Works only with strings. Can be used for optimization of equals, like and in expressions.

        - `n` ngram size
        - `size_of_bloom_filter_in_bytes` Bloom filter size in bytes (you can use large values here,
          for example 256 or 512, because it can be compressed well).
        - `number_of_hash_functions` The number of hash functions used in the Bloom filter.
        - `random_seed` The seed for Bloom filter hash functions.
        '''
        params = (n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)
        return 'ngrambf_v1(%d, %d, %d, %d)' % params

    @staticmethod
    def tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed):
        '''
        An index type that stores a Bloom filter containing string tokens. Tokens are
        sequences separated by non-alphanumeric characters.

        - `size_of_bloom_filter_in_bytes` Bloom filter size in bytes (you can use large values here,
          for example 256 or 512, because it can be compressed well).
        - `number_of_hash_functions` The number of hash functions used in the Bloom filter.
        - `random_seed` The seed for Bloom filter hash functions.
        '''
        params = (size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)
        return 'tokenbf_v1(%d, %d, %d)' % params

    @staticmethod
    def bloom_filter(false_positive=0.025):
        '''
        An index type that stores a Bloom filter containing values of the index expression.

        - `false_positive` - the probability (between 0 and 1) of receiving a false positive
          response from the filter
        '''
        return 'bloom_filter(%f)' % false_positive
class ModelBase(type):
'''
A metaclass for ORM models. It adds the _fields list to model classes.
@ -22,16 +127,27 @@ class ModelBase(type):
ad_hoc_model_cache = {}
def __new__(cls, name, bases, attrs):
# Collect fields from parent classes
base_fields = dict()
# Collect fields, constraints and indexes from parent classes
fields = {}
constraints = {}
indexes = {}
for base in bases:
if isinstance(base, ModelBase):
base_fields.update(base._fields)
fields.update(base._fields)
constraints.update(base._constraints)
indexes.update(base._indexes)
fields = base_fields
# Add fields, constraints and indexes from this class
for n, obj in attrs.items():
if isinstance(obj, Field):
fields[n] = obj
elif isinstance(obj, Constraint):
constraints[n] = obj
elif isinstance(obj, Index):
indexes[n] = obj
# Build a list of fields, in the order they were listed in the class
fields.update({n: f for n, f in attrs.items() if isinstance(f, Field)})
# Convert fields to a list of (name, field) tuples, in the order they were listed in the class
fields = sorted(fields.items(), key=lambda item: item[1].creation_counter)
# Build a dictionary of default values
@ -46,19 +162,22 @@ class ModelBase(type):
else:
defaults[n] = f.to_python(f.default, pytz.UTC)
# Create the model class
attrs = dict(
attrs,
_fields=OrderedDict(fields),
_constraints=constraints,
_indexes=indexes,
_writable_fields=OrderedDict([f for f in fields if not f[1].readonly]),
_defaults=defaults,
_has_funcs_as_defaults=has_funcs_as_defaults
)
model = super(ModelBase, cls).__new__(cls, str(name), bases, attrs)
# Let each field know its parent and its own name
for n, f in fields:
setattr(f, 'parent', model)
setattr(f, 'name', n)
# Let each field, constraint and index know its parent and its own name
for n, obj in chain(fields, constraints.items(), indexes.items()):
setattr(obj, 'parent', model)
setattr(obj, 'name', n)
return model
@ -87,8 +206,17 @@ class ModelBase(type):
return orm_fields.BaseEnumField.create_ad_hoc_field(db_type)
# DateTime with timezone
if db_type.startswith('DateTime('):
# Some functions return DateTimeField with timezone in brackets
return orm_fields.DateTimeField()
timezone = db_type[9:-1]
return orm_fields.DateTimeField(
timezone=timezone[1:-1] if timezone else None
)
# DateTime64
if db_type.startswith('DateTime64('):
precision, *timezone = [s.strip() for s in db_type[11:-1].split(',')]
return orm_fields.DateTime64Field(
precision=int(precision),
timezone=timezone[0][1:-1] if timezone else None
)
# Arrays
if db_type.startswith('Array'):
inner_field = cls.create_ad_hoc_field(db_type[6 : -1])
@ -222,13 +350,21 @@ class Model(metaclass=ModelBase):
@classmethod
def create_table_sql(cls, db):
'''
Returns the SQL command for creating a table for this model.
Returns the SQL statement for creating a table for this model.
'''
parts = ['CREATE TABLE IF NOT EXISTS `%s`.`%s` (' % (db.db_name, cls.table_name())]
cols = []
# Fields
items = []
for name, field in cls.fields().items():
cols.append(' %s %s' % (name, field.get_sql(db=db)))
parts.append(',\n'.join(cols))
items.append(' %s %s' % (name, field.get_sql(db=db)))
# Constraints
for c in cls._constraints.values():
items.append(' %s' % c.create_table_sql())
# Indexes
for i in cls._indexes.values():
items.append(' %s' % i.create_table_sql())
parts.append(',\n'.join(items))
# Engine
parts.append(')')
parts.append('ENGINE = ' + cls.engine.create_table_sql(db))
return '\n'.join(parts)
@ -248,14 +384,15 @@ class Model(metaclass=ModelBase):
- `line`: the TSV-formatted data.
- `field_names`: names of the model fields in the data.
- `timezone_in_use`: the timezone to use when parsing dates and datetimes.
- `timezone_in_use`: the timezone to use when parsing dates and datetimes. Some fields use their own timezones.
- `database`: if given, sets the database that this instance belongs to.
'''
values = iter(parse_tsv(line))
kwargs = {}
for name in field_names:
field = getattr(cls, name)
kwargs[name] = field.to_python(next(values), timezone_in_use)
field_timezone = getattr(field, 'timezone', None) or timezone_in_use
kwargs[name] = field.to_python(next(values), field_timezone)
obj = cls(**kwargs)
if database is not None:
@ -348,7 +485,7 @@ class BufferModel(Model):
@classmethod
def create_table_sql(cls, db):
'''
Returns the SQL command for creating a table for this model.
Returns the SQL statement for creating a table for this model.
'''
parts = ['CREATE TABLE IF NOT EXISTS `%s`.`%s` AS `%s`.`%s`' % (db.db_name, cls.table_name(), db.db_name,
cls.engine.main_model.table_name())]
@ -370,6 +507,9 @@ class MergeModel(Model):
@classmethod
def create_table_sql(cls, db):
'''
Returns the SQL statement for creating a table for this model.
'''
assert isinstance(cls.engine, Merge), "engine must be an instance of engines.Merge"
parts = ['CREATE TABLE IF NOT EXISTS `%s`.`%s` (' % (db.db_name, cls.table_name())]
cols = []
@ -386,10 +526,14 @@ class MergeModel(Model):
class DistributedModel(Model):
"""
Model for Distributed engine
Model class for use with a `Distributed` engine.
"""
def set_database(self, db):
'''
Sets the `Database` that this model instance belongs to.
This is done automatically when the instance is read from the database or written to it.
'''
assert isinstance(self.engine, Distributed), "engine must be an instance of engines.Distributed"
res = super(DistributedModel, self).set_database(db)
return res
@ -447,6 +591,9 @@ class DistributedModel(Model):
@classmethod
def create_table_sql(cls, db):
'''
Returns the SQL statement for creating a table for this model.
'''
assert isinstance(cls.engine, Distributed), "engine must be engines.Distributed instance"
cls.fix_engine_table()
@ -459,4 +606,4 @@ class DistributedModel(Model):
# Expose only relevant classes in import *
__all__ = get_subclass_names(locals(), Model)
__all__ = get_subclass_names(locals(), (Model, Constraint, Index))

View File

@ -3,9 +3,8 @@ from __future__ import unicode_literals
import pytz
from copy import copy, deepcopy
from math import ceil
from .engines import CollapsingMergeTree
from datetime import date, datetime
from .utils import comma_join, string_or_func
from .utils import comma_join, string_or_func, arg_to_sql
# TODO
@ -538,15 +537,50 @@ class QuerySet(object):
def final(self):
"""
Adds a FINAL modifier to table, meaning data will be collapsed to final version.
Can be used with `CollapsingMergeTree` engine only.
Can be used with the `CollapsingMergeTree` and `ReplacingMergeTree` engines only.
"""
if not isinstance(self._model_cls.engine, CollapsingMergeTree):
raise TypeError('final() method can be used only with CollapsingMergeTree engine')
from .engines import CollapsingMergeTree, ReplacingMergeTree
if not isinstance(self._model_cls.engine, (CollapsingMergeTree, ReplacingMergeTree)):
raise TypeError('final() method can be used only with the CollapsingMergeTree and ReplacingMergeTree engines')
qs = copy(self)
qs._final = True
return qs
def delete(self):
    """
    Issues an `ALTER TABLE ... DELETE` statement for all records matched by this queryset's
    conditions (the WHERE and PREWHERE conditions combined), and returns the queryset itself.
    Note that ClickHouse performs deletions in the background, so they are not immediate.
    """
    self._verify_mutation_allowed()
    combined_q = self._where_q & self._prewhere_q
    where_sql = combined_q.to_sql(self._model_cls)
    table = self._model_cls.table_name()
    self._database.raw('ALTER TABLE $db.`%s` DELETE WHERE %s' % (table, where_sql))
    return self
def update(self, **kwargs):
    """
    Updates all records matched by this queryset's conditions.
    Keyword arguments specify the field names and expressions to use for the update.
    Note that ClickHouse performs updates in the background, so they are not immediate.

    Raises an AssertionError if no keyword arguments are given, or if
    `_verify_mutation_allowed` rejects the queryset's state.
    Returns the queryset itself.
    """
    # Raised explicitly rather than via an `assert` statement, so that the check
    # is not stripped when Python runs with optimizations enabled (`python -O`)
    if not kwargs:
        raise AssertionError('No fields specified for update')
    self._verify_mutation_allowed()
    # One "`name` = <sql expression>" assignment per keyword argument
    fields = comma_join('`%s` = %s' % (name, arg_to_sql(expr)) for name, expr in kwargs.items())
    # WHERE and PREWHERE conditions are combined into a single WHERE clause
    conditions = (self._where_q & self._prewhere_q).to_sql(self._model_cls)
    sql = 'ALTER TABLE $db.`%s` UPDATE %s WHERE %s' % (self._model_cls.table_name(), fields, conditions)
    self._database.raw(sql)
    return self
def _verify_mutation_allowed(self):
    '''
    Checks that the queryset's state allows mutations. Raises an AssertionError if not.

    Mutations are rejected after slicing, `limit_by(...)`, `distinct()` or `final()`.
    '''
    # The errors are raised explicitly rather than via `assert` statements, so that
    # the checks are not stripped when Python runs with optimizations enabled (`python -O`)
    if self._limits:
        raise AssertionError('Mutations are not allowed after slicing the queryset')
    if self._limit_by:
        raise AssertionError('Mutations are not allowed after calling limit_by(...)')
    if self._distinct:
        raise AssertionError('Mutations are not allowed after calling distinct()')
    if self._final:
        raise AssertionError('Mutations are not allowed after calling final()')
def aggregate(self, *args, **kwargs):
"""
Returns an `AggregateQuerySet` over this query, with `args` serving as
@ -647,6 +681,9 @@ class AggregateQuerySet(QuerySet):
qs._grouping_with_totals = True
return qs
def _verify_mutation_allowed(self):
    '''
    Always raises an AssertionError.
    '''
    reason = 'Cannot mutate an AggregateQuerySet'
    raise AssertionError(reason)
# Expose only relevant classes in import *
__all__ = [c.__name__ for c in [Q, QuerySet, AggregateQuerySet]]

View File

@ -1,6 +1,6 @@
from __future__ import unicode_literals
import codecs
import re
from datetime import date, datetime, tzinfo, timedelta
SPECIAL_CHARS = {
@ -42,6 +42,40 @@ def string_or_func(obj):
return obj.to_sql() if hasattr(obj, 'to_sql') else obj
def arg_to_sql(arg):
"""
Converts a function argument to SQL string according to its type.
Supports functions, model fields, strings, dates, datetimes, timedeltas, booleans,
None, numbers, timezones, arrays/iterables.

The type checks are performed in order and the first match wins, so the order
of the checks below is significant. Values that match none of the checks are
converted using `str(arg)`.
"""
# Imported inside the function rather than at module level
# (presumably to avoid a circular import - TODO confirm).
# NOTE(review): DateField is imported here but not used by this function.
from infi.clickhouse_orm import Field, StringField, DateTimeField, DateField, F, QuerySet
# Function expressions render themselves
if isinstance(arg, F):
return arg.to_sql()
# Model fields are rendered as a backtick-quoted string (using the field's string representation)
if isinstance(arg, Field):
return "`%s`" % arg
# Strings are converted by StringField.to_db_string
if isinstance(arg, str):
return StringField().to_db_string(arg)
# datetime must be checked before date, because datetime is a subclass of date
if isinstance(arg, datetime):
return "toDateTime(%s)" % DateTimeField().to_db_string(arg)
if isinstance(arg, date):
return "toDate('%s')" % arg.isoformat()
# Timedeltas are truncated to whole seconds
if isinstance(arg, timedelta):
return "toIntervalSecond(%d)" % int(arg.total_seconds())
# Booleans are rendered as 1 or 0
if isinstance(arg, bool):
return str(int(arg))
# Timezones are rendered as a string containing tzname(None)
if isinstance(arg, tzinfo):
return StringField().to_db_string(arg.tzname(None))
if arg is None:
return 'NULL'
# Querysets are rendered as their string representation, wrapped in parentheses
if isinstance(arg, QuerySet):
return "(%s)" % arg
# Tuples must be checked before other iterables: tuples are rendered with
# parentheses, any other iterable with square brackets. Elements are converted recursively.
if isinstance(arg, tuple):
return '(' + comma_join(arg_to_sql(x) for x in arg) + ')'
if is_iterable(arg):
return '[' + comma_join(arg_to_sql(x) for x in arg) + ']'
# Fallback for everything else (e.g. numbers)
return str(arg)
def parse_tsv(line):
if isinstance(line, bytes):
line = line.decode()

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import unittest
from infi.clickhouse_orm.database import Database
@ -21,6 +20,10 @@ class TestCaseWithData(unittest.TestCase):
self.database.drop_table(Person)
self.database.drop_database()
def _insert_all(self):
    '''
    Inserts everything produced by `_sample_data()` and checks that
    the count of `Person` records in the database is truthy afterwards.
    '''
    insert = self.database.insert
    insert(self._sample_data())
    person_count = self.database.count(Person)
    self.assertTrue(person_count)
def _insert_and_check(self, data, count, batch_size=1000):
self.database.insert(data, batch_size=batch_size)
self.assertEqual(count, self.database.count(Person))
@ -32,6 +35,7 @@ class TestCaseWithData(unittest.TestCase):
yield Person(**entry)
class Person(Model):
first_name = StringField()

View File

@ -0,0 +1,6 @@
from infi.clickhouse_orm import migrations
from ..test_migrations import *
# The operations of this migration: a single CreateTable operation for ModelWithConstraints.
operations = [
migrations.CreateTable(ModelWithConstraints)
]

View File

@ -0,0 +1,6 @@
from infi.clickhouse_orm import migrations
from ..test_migrations import *
# The operations of this migration: a single AlterConstraints operation for ModelWithConstraints2.
operations = [
migrations.AlterConstraints(ModelWithConstraints2)
]

View File

@ -0,0 +1,6 @@
from infi.clickhouse_orm import migrations
from ..test_migrations import *
# The operations of this migration: a single CreateTable operation for ModelWithIndex.
operations = [
migrations.CreateTable(ModelWithIndex)
]

View File

@ -0,0 +1,6 @@
from infi.clickhouse_orm import migrations
from ..test_migrations import *
# The operations of this migration: a single AlterIndexes operation for ModelWithIndex2,
# invoked with reindex=True.
operations = [
migrations.AlterIndexes(ModelWithIndex2, reindex=True)
]

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
from datetime import date

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
from datetime import date

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import unittest
from infi.clickhouse_orm.models import BufferModel

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
import datetime
import pytz

44
tests/test_constraints.py Normal file
View File

@ -0,0 +1,44 @@
import unittest
from infi.clickhouse_orm import *
from .base_test_with_data import Person
class ConstraintsTest(unittest.TestCase):
    '''
    Checks that the constraints declared on `PersonWithConstraints` are enforced on insert.
    Skipped on ClickHouse servers older than 19.14.3.3.
    '''

    def setUp(self):
        self.database = Database('test-db', log_statements=True)
        if self.database.server_version < (19, 14, 3, 3):
            raise unittest.SkipTest('ClickHouse version too old')
        self.database.create_table(PersonWithConstraints)

    def tearDown(self):
        self.database.drop_database()

    def test_insert_valid_values(self):
        # Passes as long as the insert does not raise
        self.database.insert([
            PersonWithConstraints(first_name="Mike", last_name="Caruzo", birthday="2000-01-01", height=1.66)
        ])

    def test_insert_invalid_values(self):
        # `assertRaises` yields a context manager, not the error itself: the raised
        # error is available as its `exception` attribute once the `with` block exits.
        # The checks are therefore made after the block and on `cm.exception`.
        # NOTE(review): assumes ServerError exposes `code` and `message`, as the
        # original assertions intended - confirm against the ServerError class.

        # A birthday in the future violates `birthday_in_the_past`
        with self.assertRaises(ServerError) as cm:
            self.database.insert([
                PersonWithConstraints(first_name="Mike", last_name="Caruzo", birthday="2100-01-01", height=1.66)
            ])
        self.assertEqual(cm.exception.code, 469)
        self.assertIn('Constraint `birthday_in_the_past`', cm.exception.message)

        # A height above 2.75 violates `max_height`
        with self.assertRaises(ServerError) as cm:
            self.database.insert([
                PersonWithConstraints(first_name="Mike", last_name="Caruzo", birthday="1970-01-01", height=3)
            ])
        self.assertEqual(cm.exception.code, 469)
        self.assertIn('Constraint `max_height`', cm.exception.message)
class PersonWithConstraints(Person):
# The Person sample model, extended with two constraints (used by ConstraintsTest).
# The birthday must not be later than today
birthday_in_the_past = Constraint(Person.birthday <= F.today())
# The height must not exceed 2.75
max_height = Constraint(Person.height <= 2.75)

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
from infi.clickhouse_orm.database import Database
from infi.clickhouse_orm.fields import Field, Int16Field

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import unittest
import datetime

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
import datetime
import pytz
@ -13,6 +12,8 @@ class DateFieldsTest(unittest.TestCase):
def setUp(self):
self.database = Database('test-db', log_statements=True)
if self.database.server_version < (20, 1, 2, 4):
raise unittest.SkipTest('ClickHouse version too old')
self.database.create_table(ModelWithDate)
def tearDown(self):
@ -20,8 +21,17 @@ class DateFieldsTest(unittest.TestCase):
def test_ad_hoc_model(self):
self.database.insert([
ModelWithDate(date_field='2016-08-30', datetime_field='2016-08-30 03:50:00'),
ModelWithDate(date_field='2016-08-31', datetime_field='2016-08-31 01:30:00')
ModelWithDate(
date_field='2016-08-30',
datetime_field='2016-08-30 03:50:00',
datetime64_field='2016-08-30 03:50:00.123456',
datetime64_3_field='2016-08-30 03:50:00.123456'
),
ModelWithDate(
date_field='2016-08-31',
datetime_field='2016-08-31 01:30:00',
datetime64_field='2016-08-31 01:30:00.123456',
datetime64_3_field='2016-08-31 01:30:00.123456')
])
# toStartOfHour returns DateTime('Asia/Yekaterinburg') in my case, so I test it here to
@ -35,10 +45,75 @@ class DateFieldsTest(unittest.TestCase):
self.assertEqual(results[1].datetime_field, datetime.datetime(2016, 8, 31, 1, 30, 0, tzinfo=pytz.UTC))
self.assertEqual(results[1].hour_start, datetime.datetime(2016, 8, 31, 1, 0, 0, tzinfo=pytz.UTC))
self.assertEqual(results[0].datetime64_field, datetime.datetime(2016, 8, 30, 3, 50, 0, 123456, tzinfo=pytz.UTC))
self.assertEqual(results[0].datetime64_3_field, datetime.datetime(2016, 8, 30, 3, 50, 0, 123000,
tzinfo=pytz.UTC))
self.assertEqual(results[1].datetime64_field, datetime.datetime(2016, 8, 31, 1, 30, 0, 123456, tzinfo=pytz.UTC))
self.assertEqual(results[1].datetime64_3_field, datetime.datetime(2016, 8, 31, 1, 30, 0, 123000,
tzinfo=pytz.UTC))
class ModelWithDate(Model):
# Model used by DateFieldsTest, with one Date field, one DateTime field and two DateTime64 fields.
date_field = DateField()
datetime_field = DateTimeField()
# DateTime64 field created without an explicit precision
datetime64_field = DateTime64Field()
# DateTime64 field with an explicit precision of 3
datetime64_3_field = DateTime64Field(precision=3)
engine = MergeTree('date_field', ('date_field',))
class ModelWithTz(Model):
# Model used by DateTimeFieldWithTzTest: datetime fields declared with and without an explicit timezone.
datetime_no_tz_field = DateTimeField() # server tz
# Timezone given by name
datetime_tz_field = DateTimeField(timezone='Europe/Madrid')
datetime64_tz_field = DateTime64Field(timezone='Europe/Madrid')
# Timezone given as a pytz timezone object
datetime_utc_field = DateTimeField(timezone=pytz.UTC)
engine = MergeTree('datetime_no_tz_field', ('datetime_no_tz_field',))
class DateTimeFieldWithTzTest(unittest.TestCase):
    '''
    Checks the values and timezones of the `ModelWithTz` datetime fields
    after a round trip through the database.
    Skipped on ClickHouse servers older than 20.1.2.4.
    '''

    def setUp(self):
        db = Database('test-db', log_statements=True)
        self.database = db
        if db.server_version < (20, 1, 2, 4):
            raise unittest.SkipTest('ClickHouse version too old')
        db.create_table(ModelWithTz)

    def tearDown(self):
        self.database.drop_database()

    def test_ad_hoc_model(self):
        field_names = (
            'datetime_no_tz_field',
            'datetime_tz_field',
            'datetime64_tz_field',
            'datetime_utc_field',
        )
        # Two rows; within each row, every field gets the same textual value
        # (the second value carries an explicit +0300 offset)
        texts = ('2020-06-11 04:00:00', '2020-06-11 07:00:00+0300')
        self.database.insert([
            ModelWithTz(**dict.fromkeys(field_names, text)) for text in texts
        ])
        query = 'SELECT * from $db.modelwithtz ORDER BY datetime_no_tz_field'
        results = list(self.database.select(query))

        # Every field of both rows is expected to equal 04:00 UTC
        for index in (0, 1):
            row = results[index]
            for name in field_names:
                self.assertEqual(getattr(row, name), datetime.datetime(2020, 6, 11, 4, 0, 0, tzinfo=pytz.UTC))

        # Every field is expected to carry its own timezone,
        # or the server's timezone when none was declared
        for index in (0, 1):
            row = results[index]
            self.assertEqual(row.datetime_no_tz_field.tzinfo.zone, self.database.server_timezone.zone)
            for name in ('datetime_tz_field', 'datetime64_tz_field'):
                self.assertEqual(getattr(row, name).tzinfo.zone, pytz.timezone('Europe/Madrid').zone)
            self.assertEqual(row.datetime_utc_field.tzinfo.zone, pytz.timezone('UTC').zone)

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import unittest
from decimal import Decimal

131
tests/test_dictionaries.py Normal file
View File

@ -0,0 +1,131 @@
import unittest
import logging
from infi.clickhouse_orm import *
class DictionaryTestMixin:
    '''
    Shared set-up, tear-down and helper for the dictionary function tests.
    The class it is mixed into must provide `_create_dictionary()`.
    Tests are skipped on ClickHouse servers older than 20.1.11.73.
    '''

    def setUp(self):
        db = Database('test-db', log_statements=True)
        self.database = db
        if db.server_version < (20, 1, 11, 73):
            raise unittest.SkipTest('ClickHouse version too old')
        self._create_dictionary()

    def tearDown(self):
        self.database.drop_database()

    def _test_func(self, func, expected_value):
        '''
        Selects the given function expression as `value`, compares the first
        row's value to `expected_value`, and returns that value.
        '''
        sql = 'SELECT %s AS value' % func.to_sql()
        logging.info(sql)
        rows = list(self.database.select(sql))
        if rows:
            logging.info('\t==> %s', rows[0].value)
        else:
            logging.info('\t==> %s', '<empty>')
        # An empty result raises an IndexError here
        actual = rows[0].value
        print('Comparing %s to %s' % (actual, expected_value))
        self.assertEqual(actual, expected_value)
        return actual
class SimpleDictionaryTest(DictionaryTestMixin, unittest.TestCase):
# Tests the dictionary functions against a flat dictionary that maps numbers to their English names.
def _create_dictionary(self):
# Create a table to be used as source for the dictionary
self.database.create_table(NumberName)
# One record per word: number 0 is named 'Zero', ..., number 10 is named 'Ten'
self.database.insert(
NumberName(number=i, name=name)
for i, name in enumerate('Zero One Two Three Four Five Six Seven Eight Nine Ten'.split())
)
# Create the dictionary
# The source is the `numbername` table in the `test-db` database; the connection details
# (localhost:9000, user 'default', empty password) are hard-coded in the statement.
# Missing keys yield '?' for the `name` attribute.
self.database.raw("""
CREATE DICTIONARY numbers_dict(
number UInt64,
name String DEFAULT '?'
)
PRIMARY KEY number
SOURCE(CLICKHOUSE(
HOST 'localhost' PORT 9000 USER 'default' PASSWORD '' DB 'test-db' TABLE 'numbername'
))
LIFETIME(100)
LAYOUT(HASHED());
""")
# Name used by the tests to refer to the dictionary
self.dict_name = 'test-db.numbers_dict'
def test_dictget(self):
# An existing key returns its name; a missing key returns the attribute's default
self._test_func(F.dictGet(self.dict_name, 'name', F.toUInt64(3)), 'Three')
self._test_func(F.dictGet(self.dict_name, 'name', F.toUInt64(99)), '?')
def test_dictgetordefault(self):
# A missing key returns the default passed to the function
self._test_func(F.dictGetOrDefault(self.dict_name, 'name', F.toUInt64(3), 'n/a'), 'Three')
self._test_func(F.dictGetOrDefault(self.dict_name, 'name', F.toUInt64(99), 'n/a'), 'n/a')
def test_dicthas(self):
# 1 for an existing key, 0 for a missing key
self._test_func(F.dictHas(self.dict_name, F.toUInt64(3)), 1)
self._test_func(F.dictHas(self.dict_name, F.toUInt64(99)), 0)
class HierarchicalDictionaryTest(DictionaryTestMixin, unittest.TestCase):
# Tests the dictionary functions against a hierarchical dictionary of regions.
def _create_dictionary(self):
# Create a table to be used as source for the dictionary
self.database.create_table(Region)
# Two chains of regions: Center -> Moscow -> Russia, and London -> Great Britain.
# The top-level regions have parent_region=0.
self.database.insert([
Region(region_id=1, parent_region=0, region_name='Russia'),
Region(region_id=2, parent_region=1, region_name='Moscow'),
Region(region_id=3, parent_region=2, region_name='Center'),
Region(region_id=4, parent_region=0, region_name='Great Britain'),
Region(region_id=5, parent_region=4, region_name='London'),
])
# Create the dictionary
# The source is the `region` table in the `test-db` database; the connection details
# (localhost:9000, user 'default', empty password) are hard-coded in the statement.
# `parent_region` is declared as the HIERARCHICAL attribute.
self.database.raw("""
CREATE DICTIONARY regions_dict(
region_id UInt64,
parent_region UInt64 HIERARCHICAL,
region_name String DEFAULT '?'
)
PRIMARY KEY region_id
SOURCE(CLICKHOUSE(
HOST 'localhost' PORT 9000 USER 'default' PASSWORD '' DB 'test-db' TABLE 'region'
))
LIFETIME(100)
LAYOUT(HASHED());
""")
# Name used by the tests to refer to the dictionary
self.dict_name = 'test-db.regions_dict'
def test_dictget(self):
# Existing keys return their attributes; a missing key returns the attribute's default
self._test_func(F.dictGet(self.dict_name, 'region_name', F.toUInt64(3)), 'Center')
self._test_func(F.dictGet(self.dict_name, 'parent_region', F.toUInt64(3)), 2)
self._test_func(F.dictGet(self.dict_name, 'region_name', F.toUInt64(99)), '?')
def test_dictgetordefault(self):
# A missing key returns the default passed to the function
self._test_func(F.dictGetOrDefault(self.dict_name, 'region_name', F.toUInt64(3), 'n/a'), 'Center')
self._test_func(F.dictGetOrDefault(self.dict_name, 'region_name', F.toUInt64(99), 'n/a'), 'n/a')
def test_dicthas(self):
# 1 for an existing key, 0 for a missing key
self._test_func(F.dictHas(self.dict_name, F.toUInt64(3)), 1)
self._test_func(F.dictHas(self.dict_name, F.toUInt64(99)), 0)
def test_dictgethierarchy(self):
# The expected chain starts at the key itself and ends at the top-level region;
# for a missing key the expected chain contains only the key
self._test_func(F.dictGetHierarchy(self.dict_name, F.toUInt64(3)), [3, 2, 1])
self._test_func(F.dictGetHierarchy(self.dict_name, F.toUInt64(99)), [99])
def test_dictisin(self):
# Region 3 is under region 1 but not under region 4; the missing key 99 is not under region 4
self._test_func(F.dictIsIn(self.dict_name, F.toUInt64(3), F.toUInt64(1)), 1)
self._test_func(F.dictIsIn(self.dict_name, F.toUInt64(3), F.toUInt64(4)), 0)
self._test_func(F.dictIsIn(self.dict_name, F.toUInt64(99), F.toUInt64(4)), 0)
class NumberName(Model):
''' A table to act as a source for the dictionary '''
# The dictionary key
number = UInt64Field()
# The name of the number
name = StringField()
engine = Memory()
class Region(Model):
# Source table for the hierarchical dictionary created by HierarchicalDictionaryTest.
# The dictionary key
region_id = UInt64Field()
# The region_id of the containing region (the test data uses 0 for top-level regions)
parent_region = UInt64Field()
region_name = StringField()
engine = Memory()

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
import datetime

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
from infi.clickhouse_orm.database import Database

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import unittest
from infi.clickhouse_orm.database import Database

View File

@ -34,6 +34,7 @@ class FuncsTestCase(TestCaseWithData):
result = list(self.database.select(sql))
logging.info('\t==> %s', result[0].value if result else '<empty>')
if expected_value != NO_VALUE:
print('Comparing %s to %s' % (result[0].value, expected_value))
self.assertEqual(result[0].value, expected_value)
return result[0].value if result else None
except ServerError as e:
@ -310,12 +311,13 @@ class FuncsTestCase(TestCaseWithData):
raise unittest.SkipTest('This test must run with UTC as the server timezone')
d = date(2018, 12, 31)
dt = datetime(2018, 12, 31, 11, 22, 33)
athens_tz = pytz.timezone('Europe/Athens')
self._test_func(F.toHour(dt), 11)
self._test_func(F.toStartOfDay(dt), datetime(2018, 12, 31, 0, 0, 0, tzinfo=pytz.utc))
self._test_func(F.toTime(dt, pytz.utc), datetime(1970, 1, 2, 11, 22, 33, tzinfo=pytz.utc))
self._test_func(F.toTime(dt, 'Europe/Athens'), datetime(1970, 1, 2, 13, 22, 33, tzinfo=pytz.utc))
self._test_func(F.toTime(dt, pytz.timezone('Europe/Athens')), datetime(1970, 1, 2, 13, 22, 33, tzinfo=pytz.utc))
self._test_func(F.toTimeZone(dt, 'Europe/Athens'), datetime(2018, 12, 31, 13, 22, 33, tzinfo=pytz.utc))
self._test_func(F.toTime(dt, 'Europe/Athens'), athens_tz.localize(datetime(1970, 1, 2, 13, 22, 33)))
self._test_func(F.toTime(dt, athens_tz), athens_tz.localize(datetime(1970, 1, 2, 13, 22, 33)))
self._test_func(F.toTimeZone(dt, 'Europe/Athens'), athens_tz.localize(datetime(2018, 12, 31, 13, 22, 33)))
self._test_func(F.now(), datetime.utcnow().replace(tzinfo=pytz.utc, microsecond=0)) # FIXME this may fail if the timing is just right
self._test_func(F.today(), datetime.utcnow().date())
self._test_func(F.yesterday(), datetime.utcnow().date() - timedelta(days=1))
@ -351,6 +353,7 @@ class FuncsTestCase(TestCaseWithData):
if self.database.server_timezone != pytz.utc:
raise unittest.SkipTest('This test must run with UTC as the server timezone')
self._test_func(F.toDateTime('2018-12-31 11:22:33'), datetime(2018, 12, 31, 11, 22, 33, tzinfo=pytz.utc))
self._test_func(F.toDateTime64('2018-12-31 11:22:33.001', 6), datetime(2018, 12, 31, 11, 22, 33, 1000, tzinfo=pytz.utc))
self._test_func(F.parseDateTimeBestEffort('31/12/2019 10:05AM'), datetime(2019, 12, 31, 10, 5, tzinfo=pytz.utc))
self._test_func(F.parseDateTimeBestEffortOrNull('31/12/2019 10:05AM'), datetime(2019, 12, 31, 10, 5, tzinfo=pytz.utc))
self._test_func(F.parseDateTimeBestEffortOrZero('31/12/2019 10:05AM'), datetime(2019, 12, 31, 10, 5, tzinfo=pytz.utc))

32
tests/test_indexes.py Normal file
View File

@ -0,0 +1,32 @@
import unittest
from infi.clickhouse_orm import *
class IndexesTest(unittest.TestCase):
    '''
    Checks that a table whose model declares indexes can be created.
    Skipped on ClickHouse servers older than 20.1.2.4.
    '''

    def setUp(self):
        db = Database('test-db', log_statements=True)
        self.database = db
        if db.server_version < (20, 1, 2, 4):
            raise unittest.SkipTest('ClickHouse version too old')

    def tearDown(self):
        self.database.drop_database()

    def test_all_index_types(self):
        # Passes as long as creating the table does not raise
        model_class = ModelWithIndexes
        self.database.create_table(model_class)
class ModelWithIndexes(Model):
# Model used by IndexesTest, declaring one index of each type offered by Index.
date = DateField()
f1 = Int32Field()
f2 = StringField()
# Indexes over a single column
i1 = Index(f1, type=Index.minmax(), granularity=1)
i2 = Index(f1, type=Index.set(1000), granularity=2)
i3 = Index(f2, type=Index.ngrambf_v1(3, 256, 2, 0), granularity=1)
# Index over a function expression
i4 = Index(F.lower(f2), type=Index.tokenbf_v1(256, 2, 0), granularity=2)
# Index over a tuple containing an expression and a column
i5 = Index((F.toQuarter(date), f2), type=Index.bloom_filter(), granularity=3)
engine = MergeTree('date', ('date',))

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
import datetime
import pytz

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
from ipaddress import IPv4Address, IPv6Address
from infi.clickhouse_orm.database import Database

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals, print_function
import unittest
import json

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
from datetime import date

View File

@ -1,8 +1,7 @@
from __future__ import unicode_literals
import unittest
from infi.clickhouse_orm.database import Database
from infi.clickhouse_orm.models import Model, BufferModel
from infi.clickhouse_orm.database import Database, ServerError
from infi.clickhouse_orm.models import Model, BufferModel, Constraint, Index
from infi.clickhouse_orm.fields import *
from infi.clickhouse_orm.engines import *
from infi.clickhouse_orm.migrations import MigrationHistory
@ -27,55 +26,58 @@ class MigrationsTestCase(unittest.TestCase):
def tearDown(self):
self.database.drop_database()
def tableExists(self, model_class):
def table_exists(self, model_class):
query = "EXISTS TABLE $db.`%s`" % model_class.table_name()
return next(self.database.select(query)).result == 1
def getTableFields(self, model_class):
def get_table_fields(self, model_class):
query = "DESC `%s`.`%s`" % (self.database.db_name, model_class.table_name())
return [(row.name, row.type) for row in self.database.select(query)]
def get_table_def(self, model_class):
return self.database.raw('SHOW CREATE TABLE $db.`%s`' % model_class.table_name())
def test_migrations(self):
# Creation and deletion of table
self.database.migrate('tests.sample_migrations', 1)
self.assertTrue(self.tableExists(Model1))
self.assertTrue(self.table_exists(Model1))
self.database.migrate('tests.sample_migrations', 2)
self.assertFalse(self.tableExists(Model1))
self.assertFalse(self.table_exists(Model1))
self.database.migrate('tests.sample_migrations', 3)
self.assertTrue(self.tableExists(Model1))
self.assertTrue(self.table_exists(Model1))
# Adding, removing and altering simple fields
self.assertEqual(self.getTableFields(Model1), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')])
self.assertEqual(self.get_table_fields(Model1), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')])
self.database.migrate('tests.sample_migrations', 4)
self.assertEqual(self.getTableFields(Model2), [('date', 'Date'), ('f1', 'Int32'), ('f3', 'Float32'), ('f2', 'String'), ('f4', 'String'), ('f5', 'Array(UInt64)')])
self.assertEqual(self.get_table_fields(Model2), [('date', 'Date'), ('f1', 'Int32'), ('f3', 'Float32'), ('f2', 'String'), ('f4', 'String'), ('f5', 'Array(UInt64)')])
self.database.migrate('tests.sample_migrations', 5)
self.assertEqual(self.getTableFields(Model3), [('date', 'Date'), ('f1', 'Int64'), ('f3', 'Float64'), ('f4', 'String')])
self.assertEqual(self.get_table_fields(Model3), [('date', 'Date'), ('f1', 'Int64'), ('f3', 'Float64'), ('f4', 'String')])
# Altering enum fields
self.database.migrate('tests.sample_migrations', 6)
self.assertTrue(self.tableExists(EnumModel1))
self.assertEqual(self.getTableFields(EnumModel1),
self.assertTrue(self.table_exists(EnumModel1))
self.assertEqual(self.get_table_fields(EnumModel1),
[('date', 'Date'), ('f1', "Enum8('dog' = 1, 'cat' = 2, 'cow' = 3)")])
self.database.migrate('tests.sample_migrations', 7)
self.assertTrue(self.tableExists(EnumModel1))
self.assertEqual(self.getTableFields(EnumModel2),
self.assertTrue(self.table_exists(EnumModel1))
self.assertEqual(self.get_table_fields(EnumModel2),
[('date', 'Date'), ('f1', "Enum16('dog' = 1, 'cat' = 2, 'horse' = 3, 'pig' = 4)")])
# Materialized fields and alias fields
self.database.migrate('tests.sample_migrations', 8)
self.assertTrue(self.tableExists(MaterializedModel))
self.assertEqual(self.getTableFields(MaterializedModel),
self.assertTrue(self.table_exists(MaterializedModel))
self.assertEqual(self.get_table_fields(MaterializedModel),
[('date_time', "DateTime"), ('date', 'Date')])
self.database.migrate('tests.sample_migrations', 9)
self.assertTrue(self.tableExists(AliasModel))
self.assertEqual(self.getTableFields(AliasModel),
self.assertTrue(self.table_exists(AliasModel))
self.assertEqual(self.get_table_fields(AliasModel),
[('date', 'Date'), ('date_alias', "Date")])
# Buffer models creation and alteration
self.database.migrate('tests.sample_migrations', 10)
self.assertTrue(self.tableExists(Model4))
self.assertTrue(self.tableExists(Model4Buffer))
self.assertEqual(self.getTableFields(Model4), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')])
self.assertEqual(self.getTableFields(Model4Buffer), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')])
self.assertTrue(self.table_exists(Model4))
self.assertTrue(self.table_exists(Model4Buffer))
self.assertEqual(self.get_table_fields(Model4), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')])
self.assertEqual(self.get_table_fields(Model4Buffer), [('date', 'Date'), ('f1', 'Int32'), ('f2', 'String')])
self.database.migrate('tests.sample_migrations', 11)
self.assertEqual(self.getTableFields(Model4), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')])
self.assertEqual(self.getTableFields(Model4Buffer), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')])
self.assertEqual(self.get_table_fields(Model4), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')])
self.assertEqual(self.get_table_fields(Model4Buffer), [('date', 'Date'), ('f3', 'DateTime'), ('f2', 'String')])
self.database.migrate('tests.sample_migrations', 12)
self.assertEqual(self.database.count(Model3), 3)
@ -88,24 +90,54 @@ class MigrationsTestCase(unittest.TestCase):
self.assertListEqual(data, [1, 2, 3, 4])
self.database.migrate('tests.sample_migrations', 14)
self.assertTrue(self.tableExists(MaterializedModel1))
self.assertEqual(self.getTableFields(MaterializedModel1),
self.assertTrue(self.table_exists(MaterializedModel1))
self.assertEqual(self.get_table_fields(MaterializedModel1),
[('date_time', 'DateTime'), ('int_field', 'Int8'), ('date', 'Date'), ('int_field_plus_one', 'Int8')])
self.assertTrue(self.tableExists(AliasModel1))
self.assertEqual(self.getTableFields(AliasModel1),
self.assertTrue(self.table_exists(AliasModel1))
self.assertEqual(self.get_table_fields(AliasModel1),
[('date', 'Date'), ('int_field', 'Int8'), ('date_alias', 'Date'), ('int_field_plus_one', 'Int8')])
# Codecs and low cardinality
self.database.migrate('tests.sample_migrations', 15)
self.assertTrue(self.tableExists(Model4_compressed))
self.assertTrue(self.table_exists(Model4_compressed))
if self.database.has_low_cardinality_support:
self.assertEqual(self.getTableFields(Model2LowCardinality),
self.assertEqual(self.get_table_fields(Model2LowCardinality),
[('date', 'Date'), ('f1', 'LowCardinality(Int32)'), ('f3', 'LowCardinality(Float32)'),
('f2', 'LowCardinality(String)'), ('f4', 'LowCardinality(Nullable(String))'), ('f5', 'Array(LowCardinality(UInt64))')])
else:
logging.warning('No support for low cardinality')
self.assertEqual(self.getTableFields(Model2),
self.assertEqual(self.get_table_fields(Model2),
[('date', 'Date'), ('f1', 'Int32'), ('f3', 'Float32'), ('f2', 'String'), ('f4', 'Nullable(String)'),
('f5', 'Array(UInt64)')])
if self.database.server_version >= (19, 14, 3, 3):
# Creating constraints
self.database.migrate('tests.sample_migrations', 16)
self.assertTrue(self.table_exists(ModelWithConstraints))
self.database.insert([ModelWithConstraints(f1=101, f2='a')])
with self.assertRaises(ServerError):
self.database.insert([ModelWithConstraints(f1=99, f2='a')])
with self.assertRaises(ServerError):
self.database.insert([ModelWithConstraints(f1=101, f2='x')])
# Modifying constraints
self.database.migrate('tests.sample_migrations', 17)
self.database.insert([ModelWithConstraints(f1=99, f2='a')])
with self.assertRaises(ServerError):
self.database.insert([ModelWithConstraints(f1=101, f2='a')])
with self.assertRaises(ServerError):
self.database.insert([ModelWithConstraints(f1=99, f2='x')])
if self.database.server_version >= (20, 1, 2, 4):
# Creating indexes
self.database.migrate('tests.sample_migrations', 18)
self.assertTrue(self.table_exists(ModelWithIndex))
self.assertIn('INDEX index ', self.get_table_def(ModelWithIndex))
self.assertIn('INDEX another_index ', self.get_table_def(ModelWithIndex))
# Modifying indexes
self.database.migrate('tests.sample_migrations', 19)
self.assertNotIn('INDEX index ', self.get_table_def(ModelWithIndex))
self.assertIn('INDEX index2 ', self.get_table_def(ModelWithIndex))
self.assertIn('INDEX another_index ', self.get_table_def(ModelWithIndex))
# Several different models with the same table name, to simulate a table that changes over time
@ -294,3 +326,68 @@ class Model2LowCardinality(Model):
@classmethod
def table_name(cls):
    """Return the hard-coded table name ``'mig'`` for this model."""
    return 'mig'
class ModelWithConstraints(Model):
    """Model declaring two constraints on the ``modelwithconstraints`` table.

    ``constraint`` restricts ``f2`` to one of 'a', 'b' or 'c', and
    ``f1_constraint`` requires ``f1`` to be greater than 100.
    """

    date = DateField()
    f1 = Int32Field()
    f2 = StringField()

    constraint = Constraint(f2.isIn(['a', 'b', 'c'])) # check reserved keyword as constraint name
    f1_constraint = Constraint(f1 > 100)

    engine = MergeTree('date', ('date',))

    @classmethod
    def table_name(cls):
        """Return the explicit table name ``'modelwithconstraints'``."""
        return 'modelwithconstraints'
class ModelWithConstraints2(Model):
    """Second model mapped to the ``modelwithconstraints`` table name.

    Keeps the ``constraint`` check on ``f2`` ('a', 'b' or 'c') and declares
    ``f1_constraint_new``, which requires ``f1`` to be less than 100.
    NOTE(review): presumably represents a later revision of the table used to
    exercise constraint-modifying migrations - confirm against the migrations.
    """

    date = DateField()
    f1 = Int32Field()
    f2 = StringField()

    constraint = Constraint(f2.isIn(['a', 'b', 'c']))
    f1_constraint_new = Constraint(f1 < 100)

    engine = MergeTree('date', ('date',))

    @classmethod
    def table_name(cls):
        """Return the explicit table name ``'modelwithconstraints'``."""
        return 'modelwithconstraints'
class ModelWithIndex(Model):
    """Model declaring two indexes on the ``modelwithindex`` table.

    ``index`` is a minmax index over ``f1`` and ``another_index`` is a
    ``set(0)`` index over ``f2``; both use a granularity of 1.
    """

    date = DateField()
    f1 = Int32Field()
    f2 = StringField()

    # The attribute names ("index", "another_index") are the index names that
    # the migrations test looks for in the table definition.
    index = Index(f1, type=Index.minmax(), granularity=1)
    another_index = Index(f2, type=Index.set(0), granularity=1)

    engine = MergeTree('date', ('date',))

    @classmethod
    def table_name(cls):
        """Return the explicit table name ``'modelwithindex'``."""
        return 'modelwithindex'
class ModelWithIndex2(Model):
    """Second model mapped to the ``modelwithindex`` table name.

    Declares ``index2`` (a bloom filter index over ``f1`` with granularity 2)
    and the same ``another_index`` (``set(0)`` over ``f2``, granularity 1) as
    ``ModelWithIndex``; it has no attribute named ``index``.
    NOTE(review): presumably represents a later revision of the table used to
    exercise index-modifying migrations - confirm against the migrations.
    """

    date = DateField()
    f1 = Int32Field()
    f2 = StringField()

    index2 = Index(f1, type=Index.bloom_filter(), granularity=2)
    another_index = Index(f2, type=Index.set(0), granularity=1)

    engine = MergeTree('date', ('date',))

    @classmethod
    def table_name(cls):
        """Return the explicit table name ``'modelwithindex'``."""
        return 'modelwithindex'

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
import datetime
import pytz

87
tests/test_mutations.py Normal file
View File

@ -0,0 +1,87 @@
import unittest
from infi.clickhouse_orm import F
from .base_test_with_data import *
from time import sleep
class MutationsTestCase(TestCaseWithData):
    """Tests for the mutation operations ``QuerySet.delete`` and ``QuerySet.update``.

    Every test issues a mutation through a ``Person`` queryset, waits until
    ``system.mutations`` reports no unfinished mutations, and then checks the
    table contents with ordinary queries.
    """

    def setUp(self):
        """Set up the base fixture and insert the test data.

        Raises:
            unittest.SkipTest: if the server version is lower than 18.
        """
        super().setUp()
        if self.database.server_version < (18,):
            raise unittest.SkipTest('ClickHouse version too old')
        # NOTE(review): _insert_all is presumably inherited from TestCaseWithData.
        self._insert_all()

    def _wait_for_mutations(self, timeout=60.0, interval=0.25):
        """Poll ``system.mutations`` until no unfinished mutation remains.

        Args:
            timeout: approximate maximum number of seconds to spend sleeping
                between polls before the test is failed.
            interval: number of seconds to sleep between polls.

        The wait is bounded so that a mutation which never completes fails
        the test instead of hanging the whole test run.
        """
        sql = 'SELECT * FROM system.mutations WHERE is_done = 0'
        remaining = timeout
        while list(self.database.raw(sql)):
            if remaining <= 0:
                self.fail('Timed out after %s seconds waiting for mutations to finish' % timeout)
            sleep(interval)
            remaining -= interval

    def test_delete_all(self):
        """Deleting through an unfiltered queryset leaves no rows."""
        Person.objects_in(self.database).delete()
        self._wait_for_mutations()
        self.assertFalse(Person.objects_in(self.database))

    def test_delete_with_where_cond(self):
        """Deleting through a filtered queryset removes only the matching rows."""
        cond = Person.first_name == 'Cassady'
        # Sanity check: there is something to delete in the first place
        self.assertTrue(Person.objects_in(self.database).filter(cond))
        Person.objects_in(self.database).filter(cond).delete()
        self._wait_for_mutations()
        self.assertFalse(Person.objects_in(self.database).filter(cond))
        self.assertTrue(Person.objects_in(self.database).exclude(cond))

    def test_delete_with_prewhere_cond(self):
        """Deleting also works when the condition is given with ``prewhere=True``."""
        cond = F.toYear(Person.birthday) == 1977
        # Sanity check: there is something to delete in the first place
        self.assertTrue(Person.objects_in(self.database).filter(cond))
        Person.objects_in(self.database).filter(cond, prewhere=True).delete()
        self._wait_for_mutations()
        self.assertFalse(Person.objects_in(self.database).filter(cond))
        self.assertTrue(Person.objects_in(self.database).exclude(cond))

    def test_update_all(self):
        """Updating through an unfiltered queryset changes every row."""
        Person.objects_in(self.database).update(height=0)
        self._wait_for_mutations()
        self.assertFalse(Person.objects_in(self.database).exclude(height=0))

    def test_update_with_where_cond(self):
        """Updating through a filtered queryset changes all matching rows."""
        cond = Person.first_name == 'Cassady'
        Person.objects_in(self.database).filter(cond).update(height=0)
        self._wait_for_mutations()
        self.assertFalse(Person.objects_in(self.database).filter(cond).exclude(height=0))

    def test_update_with_prewhere_cond(self):
        """Updating also works when the condition is given with ``prewhere=True``."""
        cond = F.toYear(Person.birthday) == 1977
        Person.objects_in(self.database).filter(cond, prewhere=True).update(height=0)
        self._wait_for_mutations()
        self.assertFalse(Person.objects_in(self.database).filter(cond).exclude(height=0))

    def test_update_multiple_fields(self):
        """A single ``update`` call can set several fields at once."""
        Person.objects_in(self.database).update(height=0, passport=None)
        self._wait_for_mutations()
        self.assertFalse(Person.objects_in(self.database).exclude(height=0))
        self.assertFalse(Person.objects_in(self.database).exclude(passport=None))

    def test_chained_update(self):
        """``update`` calls can be chained; the second update is verified here."""
        Person.objects_in(self.database).update(height=F.rand()).update(passport=99999)
        self._wait_for_mutations()
        self.assertFalse(Person.objects_in(self.database).exclude(passport=99999))

    def test_invalid_state_for_mutations(self):
        """Sliced, limit-by, distinct and aggregate querysets reject mutations."""
        base_query = Person.objects_in(self.database)
        queries = [
            base_query[0:1],
            base_query.limit_by(5, 'first_name'),
            base_query.distinct(),
            base_query.aggregate('first_name', count=F.count()),
        ]
        for index, query in enumerate(queries):
            # subTest identifies which queryset shape failed without printing
            with self.subTest(index=index):
                with self.assertRaises(AssertionError):
                    query.delete()
                with self.assertRaises(AssertionError):
                    query.update(height=1.8)

    def test_missing_fields_for_update(self):
        """Calling ``update`` without any field raises ``AssertionError``."""
        with self.assertRaises(AssertionError):
            Person.objects_in(self.database).update()

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
import pytz
@ -38,7 +37,7 @@ class NullableFieldsTest(unittest.TestCase):
if value == '\\N':
self.assertIsNone(dt)
else:
self.assertEqual(dt.tzinfo, pytz.utc)
self.assertTrue(dt.tzinfo)
# Verify that conversion to and from db string does not change value
dt2 = f.to_python(f.to_db_string(dt, quote=False), pytz.utc)
self.assertEqual(dt, dt2)

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function
import unittest
from infi.clickhouse_orm.database import Database
from infi.clickhouse_orm.query import Q
@ -287,7 +286,7 @@ class QuerySetTestCase(TestCaseWithData):
self._test_qs(qs[80:], 20)
def test_final(self):
# Final can be used with CollapsingMergeTree engine only
# Final can be used with CollapsingMergeTree/ReplacingMergeTree engines only
with self.assertRaises(TypeError):
Person.objects_in(self.database).final()

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from infi.clickhouse_orm.database import DatabaseException, ServerError
from .base_test_with_data import *

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
from infi.clickhouse_orm.database import ServerError

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
from infi.clickhouse_orm.fields import *
from datetime import date, datetime
@ -7,17 +6,21 @@ import pytz
class SimpleFieldsTest(unittest.TestCase):
def test_datetime_field(self):
f = DateTimeField()
epoch = datetime(1970, 1, 1, tzinfo=pytz.utc)
# Valid values
for value in (date(1970, 1, 1), datetime(1970, 1, 1), epoch,
dates = [
date(1970, 1, 1), datetime(1970, 1, 1), epoch,
epoch.astimezone(pytz.timezone('US/Eastern')), epoch.astimezone(pytz.timezone('Asia/Jerusalem')),
'1970-01-01 00:00:00', '1970-01-17 00:00:17', '0000-00-00 00:00:00', 0,
'2017-07-26T08:31:05', '2017-07-26T08:31:05Z', '2017-07-26 08:31',
'2017-07-26T13:31:05+05', '2017-07-26 13:31:05+0500'):
'2017-07-26T13:31:05+05', '2017-07-26 13:31:05+0500'
]
def test_datetime_field(self):
f = DateTimeField()
for value in self.dates:
dt = f.to_python(value, pytz.utc)
self.assertEqual(dt.tzinfo, pytz.utc)
self.assertTrue(dt.tzinfo)
# Verify that conversion to and from db string does not change value
dt2 = f.to_python(f.to_db_string(dt, quote=False), pytz.utc)
self.assertEqual(dt, dt2)
@ -27,6 +30,35 @@ class SimpleFieldsTest(unittest.TestCase):
with self.assertRaises(ValueError):
f.to_python(value, pytz.utc)
def test_datetime64_field(self):
    """Check DateTime64Field parsing and its round trip through the db string form."""
    field = DateTime64Field()
    # Values carrying a fractional-second component, on top of the shared dates
    fractional_values = [
        datetime(1970, 1, 1, microsecond=100000),
        pytz.timezone('US/Eastern').localize(datetime(1970, 1, 1, microsecond=100000)),
        '1970-01-01 00:00:00.1', '1970-01-17 00:00:17.1', '0000-00-00 00:00:00.1', 0.1,
        '2017-07-26T08:31:05.1', '2017-07-26T08:31:05.1Z', '2017-07-26 08:31.1',
        '2017-07-26T13:31:05.1+05', '2017-07-26 13:31:05.1+0500'
    ]
    valid_values = self.dates + fractional_values
    invalid_values = (
        'nope', '21/7/1999',
        '2017-01 15:06:00', '2017-01-01X15:06:00', '2017-13-01T15:06:00',
    )
    for candidate in valid_values:
        parsed = field.to_python(candidate, pytz.utc)
        self.assertTrue(parsed.tzinfo)
        # Serializing and parsing again must give back an equal value
        serialized = field.to_db_string(parsed, quote=False)
        reparsed = field.to_python(serialized, pytz.utc)
        self.assertEqual(parsed, reparsed)
    for candidate in invalid_values:
        with self.assertRaises(ValueError):
            field.to_python(candidate, pytz.utc)
def test_datetime64_field_precision(self):
    """Check that the db string round trip rounds microseconds to the field precision."""
    microseconds = 123456
    original = datetime(2000, 1, 1, microsecond=microseconds)
    for precision in range(1, 7):
        field = DateTime64Field(precision=precision, timezone=pytz.utc)
        parsed = field.to_python(original, pytz.utc)
        serialized = field.to_db_string(parsed, quote=False)
        reparsed = field.to_python(serialized, pytz.utc)
        # A negative ndigits rounds away the rightmost microsecond digits
        expected_microseconds = round(microseconds, precision - 6)
        expected = parsed.replace(microsecond=expected_microseconds)
        self.assertEqual(reparsed, expected)
def test_date_field(self):
f = DateField()
epoch = date(1970, 1, 1)

View File

@ -1,5 +1,3 @@
from __future__ import unicode_literals
import unittest
from datetime import date

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import unittest
from uuid import UUID
from infi.clickhouse_orm.database import Database