Added more docs

Commit 7d2d94336c (parent 5cb43ca6cd)

@ -38,7 +38,7 @@ A database alias to use in [QuerySets](queries.md) if direct [using](routing.md#

### CLICKHOUSE_SYNC_STORAGE

Defaults to: `'django_clickhouse.storages.RedisStorage'`

An [intermediate storage](storages.md) class to use. Can be a string or class.


### CLICKHOUSE_REDIS_CONFIG

Defaults to: `None`
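For illustration, a minimal sketch of what this setting might look like, assuming the dict is passed to the redis-py client as connection keyword arguments; the specific keys and values here are illustrative, not prescribed by the library:

```python
# settings.py -- sketch only; any redis.StrictRedis(...) keyword argument
# should be usable here if the dict is passed straight to the client.
CLICKHOUSE_REDIS_CONFIG = {
    'host': '127.0.0.1',
    'port': 6379,
    'db': 8,
    'socket_timeout': 10,
}
```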

@ -57,11 +57,11 @@ CLICKHOUSE_REDIS_CONFIG = {

### CLICKHOUSE_SYNC_BATCH_SIZE

Defaults to: `10000`

Maximum number of operations, fetched by the sync process from [intermediate storage](storages.md) per [sync](synchronization.md) round.


### CLICKHOUSE_SYNC_DELAY

Defaults to: `5`

A delay in seconds between the starts of two [sync](synchronization.md) rounds.


### CLICKHOUSE_MODELS_MODULE

Defaults to: `'clickhouse_models'`

@ -22,6 +22,9 @@ secondary = connections['secondary']

db_link = connections['default']
```

You can also get database objects from [QuerySet](queries.md) and [ClickHouseModel](models.md) instances by calling the `get_database(for_write: bool = False)` method.
The returned database may differ depending on the [routing](routing.md#router) you use.
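For illustration, a short sketch of calling this method on a model instance; `ClickHouseUser` is a hypothetical `ClickHouseModel` subclass (see [models](models.md)), not something the library ships:

```python
# A minimal sketch; ClickHouseUser and its fields are hypothetical.
from my_app.clickhouse_models import ClickHouseUser

obj = ClickHouseUser(id=1, first_name='Alice')

# Database the instance would be read from / written to;
# the two may differ depending on the configured routing.
read_db = obj.get_database(for_write=False)
write_db = obj.get_database(for_write=True)
```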

## Database object

Database class is based on [infi.clickhouse_orm Database object](https://github.com/Infinidat/infi.clickhouse_orm/blob/develop/docs/models_and_databases.md#models-and-databases),
but extends it with some extra attributes and methods:

@ -31,10 +34,4 @@ I expect this library [migration system](migrations.md) to be used.

Direct database migration will lead to migration information errors.


### `insert_tuples` and `select_tuples` methods

Methods to work with [ClickHouseModel namedtuples](models.md#clickhousemodel-namedtuple-form).
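A sketch of how these methods might be used; `ClickHouseUser` and its fields are hypothetical, the import path for `connections` and the argument order of `insert_tuples`/`select_tuples` are assumptions to check against the library source:

```python
# Sketch only: names and signatures below are assumed, not copied verbatim.
from django_clickhouse.database import connections  # assumed import path

from my_app.clickhouse_models import ClickHouseUser

db = connections['default']
Row = ClickHouseUser.get_tuple_class()

# Insert lightweight namedtuples instead of heavy Model instances.
db.insert_tuples(ClickHouseUser, [
    Row(id=1, first_name='Alice'),
    Row(id=2, first_name='Bob'),
])

# Read rows back as the same namedtuple class.
for row in db.select_tuples('SELECT * FROM $table', ClickHouseUser):
    print(row.id, row.first_name)
```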

@ -6,7 +6,7 @@

* [Requirements](basic_information.md#requirements)
* [Installation](basic_information.md#installation)
* [Design motivation](motivation.md)
* [Usage](overview.md)
* [Overview](overview.md)
* [Models](models.md)
* [DjangoModel](models.md#DjangoModel)

@ -109,6 +109,15 @@ class MyMultiModel(ClickHouseMultiModel):

    sub_models = [AgeData, HeightData]
```


## ClickHouseModel namedtuple form

[infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) stores data rows in special Model objects.
This works well on hundreds of records.
But when you sync 100k records in a batch, initializing 100k model instances will be slow.
To optimize this process, the `ClickHouseModel` class has a `get_tuple_class()` method.
It generates a [namedtuple](https://docs.python.org/3/library/collections.html#collections.namedtuple) class
with the same data fields the model has.
Initializing such tuples takes much less time than initializing Model objects.
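A minimal sketch of the idea, assuming a hypothetical `ClickHouseUser` model with the fields shown:

```python
import datetime

# Hypothetical model with id/first_name/birthday/visits fields.
from my_app.clickhouse_models import ClickHouseUser

# A namedtuple class with the same fields as the model.
Row = ClickHouseUser.get_tuple_class()

# Creating plain tuples is far cheaper than instantiating Model objects.
row = Row(id=1, first_name='Alice', birthday=datetime.date(1990, 1, 1), visits=0)
print(row.first_name)
```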

## Engines

An engine is ClickHouse's way of storing, indexing, replicating and sorting data ([docs](https://clickhouse.yandex/docs/en/operations/table_engines/)).
The engine system is based on the [infi.clickhouse_orm engine system](https://github.com/Infinidat/infi.clickhouse_orm/blob/develop/docs/table_engines.md#table-engines).

@ -120,3 +129,25 @@ Currently supported engines (with all infi functionality, [more info](https://gi

* `ReplacingMergeTree`
* `SummingMergeTree`
* `CollapsingMergeTree`


## Serializers

A serializer is a class that translates Django model instances to [namedtuples, inserted into ClickHouse](#clickhousemodel-namedtuple-form).
`django_clickhouse.serializers.Django2ClickHouseModelSerializer` is used by default in all models.
All serializers must inherit this class.

A serializer must implement the following interface:

```python
from django_clickhouse.serializers import Django2ClickHouseModelSerializer
from django.db.models import Model as DjangoModel
from typing import *


class CustomSerializer(Django2ClickHouseModelSerializer):
    def __init__(self, model_cls: Type['ClickHouseModel'], fields: Optional[Iterable[str]] = None,
                 exclude_fields: Optional[Iterable[str]] = None, writable: bool = False,
                 defaults: Optional[dict] = None) -> None:
        super().__init__(model_cls, fields=fields, exclude_fields=exclude_fields, writable=writable, defaults=defaults)

    def serialize(self, obj: DjangoModel) -> NamedTuple:
        pass
```
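A custom serializer would then be attached to a model; the sketch below assumes the `django_model_serializer` attribute described in [synchronization](synchronization.md) and a hypothetical `CustomSerializer` like the one above:

```python
# Sketch: wiring a custom serializer into a model (names are illustrative).
from django_clickhouse.clickhouse_models import ClickHouseModel
from infi.clickhouse_orm import fields

from my_app.models import User
from my_app.serializers import CustomSerializer  # hypothetical module


class ClickHouseUser(ClickHouseModel):
    django_model = User
    django_model_serializer = CustomSerializer

    id = fields.UInt32Field()
    first_name = fields.StringField()
    # engine and remaining fields omitted for brevity
```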

@ -11,8 +11,7 @@ You can set a common prefix for all keys in this library using [CLICKHOUSE_STATS

## Gauges

* `<prefix>.sync.<model_name>.queue`

    Number of elements in the [intermediate storage](storages.md) queue waiting for import.
    The queue should not grow large: its size depends on the configured [sync_delay](synchronization.md#configuration) and the time needed to sync a single batch.
    It is a good parameter to watch and alert on.


## Timers

@ -76,7 +76,6 @@ from my_app.models import User


class ClickHouseUser(ClickHouseModel):
    django_model = User

    id = fields.UInt32Field()
    first_name = fields.StringField()

@ -1,3 +1,46 @@

# Sync performance

Every real life system may have its own performance problems.
They depend on:
* Your ClickHouse server configuration
* Number of ClickHouse instances in your cluster
* Your data formats
* Import speed
* Network
* etc.

I recommend using [monitoring](monitoring.md) to understand where the bottleneck is and to act accordingly.

This chapter gives a list of known problems which can slow down your import.


## ClickHouse tuning

Read this [doc](https://clickhouse.tech/docs/en/introduction/performance/#performance-when-inserting-data)
and tune ClickHouse both for reads and writes.


## ClickHouse cluster

As ClickHouse is a [multimaster database](https://clickhouse.tech/docs/en/introduction/distinctive_features/#data-replication-and-data-integrity-support),
you can import to and read from any node when you have a cluster.
In order to read from and import to multiple nodes, you can use [CHProxy](https://github.com/Vertamedia/chproxy)
or add multiple databases to the [routing configuration](routing.md#clickhousemodel-routing-attributes), as sketched below.
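For illustration, a sketch of registering two nodes in `CLICKHOUSE_DATABASES`; the per-alias connection keys shown are illustrative, and the per-model routing attributes that use these aliases are described in [routing](routing.md#clickhousemodel-routing-attributes):

```python
# settings.py -- sketch only; exact connection keys depend on your setup.
CLICKHOUSE_DATABASES = {
    'default': {
        'db_name': 'analytics',
        'db_url': 'http://clickhouse-node-1:8123/',
    },
    'secondary': {
        'db_name': 'analytics',
        'db_url': 'http://clickhouse-node-2:8123/',
    },
}
```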

## CollapsingMergeTree engine and previous versions

In order to reduce the amount of data stored in [intermediate storage](storages.md),
this library doesn't store old versions of data on update or delete.
Another point is that getting previous data versions from relational storages is a hard operation.
Engines like `CollapsingMergeTree` get old versions from ClickHouse (see the sketch after this list):
1. Using `version_col` if it is set in the engine's parameters.
   This is a special field which stores incremental row versions and is filled by the library.
   It can be of any unsigned integer type (depending on how many row versions you may have).
2. Using the `FINAL` query modifier.
   This way is much slower, but doesn't require an additional column.

## Know your data

In the common case, the library user uses Python types to form ClickHouse data.
The library is responsible for converting this data into the format ClickHouse expects to receive.
This leads to a great number of conversion operations when you import data in big batches.
In order to reduce this time, you can:
* Set `MyClickHouseModel.sync_formatted_tuples` to `True`
* Override the `MyClickHouseModel.get_insert_batch(cls, import_objects: Iterable[DjangoModel])` method:
  it should get `cls.get_tuple_class()` and yield (it is a [generator](https://wiki.python.org/moin/Generators))
  tuples of string values, already prepared for insertion into ClickHouse. A sketch follows this list.

**Important note**: `ClickHouseModel.get_insert_batch(...)` can perform additional functionality depending on the model [engine](models.md#engines).
Be careful.
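A sketch of this approach; it assumes `get_insert_batch` is a classmethod and that the model's fields are the ones shown, so treat it as an outline rather than drop-in code:

```python
# Sketch: yield tuples of already-formatted string values to skip
# per-value conversion during big imports.
from typing import Iterable

from django.db.models import Model as DjangoModel

from django_clickhouse.clickhouse_models import ClickHouseModel
from infi.clickhouse_orm import fields

from my_app.models import User


class ClickHouseUser(ClickHouseModel):
    django_model = User
    sync_formatted_tuples = True  # tuples below are already ClickHouse-ready

    id = fields.UInt32Field()
    first_name = fields.StringField()

    @classmethod
    def get_insert_batch(cls, import_objects: Iterable[DjangoModel]):
        Row = cls.get_tuple_class()
        for obj in import_objects:
            # Values are pre-converted to the strings ClickHouse expects.
            yield Row(id=str(obj.id), first_name=obj.first_name)
```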

# Synchronization


## Design motivation

Read [here](motivation.md#sync-over-intermediate-storage).


## Algorithm

1. [Celery beat](https://docs.celeryproject.org/en/latest/userguide/periodic-tasks.html) schedules the `django_clickhouse.tasks.clickhouse_auto_sync` task every second or so (see the schedule sketch after this list).
2. [Celery workers](https://docs.celeryproject.org/en/latest/userguide/workers.html) execute `clickhouse_auto_sync`.
   It searches for `ClickHouseModel` subclasses which need sync (those whose `need_sync()` method returns `True`).
3. A `django_clickhouse.tasks.sync_clickhouse_model` task is scheduled for each `ClickHouseModel` which needs sync.
4. `sync_clickhouse_model` saves the sync start time in [storage](storages.md) and calls the `ClickHouseModel.sync_batch_from_storage()` method.
5. `ClickHouseModel.sync_batch_from_storage()`:
   * Gets the [storage](storages.md) the model works with, using the `ClickHouseModel.get_storage()` method.
   * Calls `Storage.pre_sync(import_key)` for the model [storage](storages.md).
     This may be used to prevent parallel execution with locks or some other operations.
   * Gets a list of operations to sync from [storage](storages.md).
   * Fetches objects from the relational database by calling the `ClickHouseModel.get_sync_objects(operations)` method.
   * Forms a batch of tuples to insert into ClickHouse using the `ClickHouseModel.get_insert_batch(import_objects)` method.
   * Inserts the batch of tuples into ClickHouse using the `ClickHouseModel.insert_batch(batch)` method.
   * Calls the `Storage.post_sync(import_key)` method to clean up storage after syncing the batch.
     This method also removes synced operations from storage.
   * If some exception occurred during execution, the `Storage.post_sync_failed(import_key)` method is called.
     Note that the process can be killed without an exception (for instance, by the OOM killer); in that case this method will not be called.
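For illustration, a beat schedule entry that runs the auto-sync task; the interval, queue name and `expires` value are choices for you to make, not library requirements:

```python
# settings.py -- sketch of a Celery beat entry for the auto-sync task.
from datetime import timedelta

CELERYBEAT_SCHEDULE = {
    'clickhouse_auto_sync': {
        'task': 'django_clickhouse.tasks.clickhouse_auto_sync',
        'schedule': timedelta(seconds=2),
        'options': {'queue': 'clickhouse', 'expires': 1},
    },
}
```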

## Configuration

Sync configuration can be set globally using Django settings.py parameters or redeclared for each `ClickHouseModel` class.
`ClickHouseModel` configuration takes priority over the settings configuration.


### Settings configuration

* [CLICKHOUSE_CELERY_QUEUE](configuration.md#clickhouse_celery_queue)

    Defaults to: `'celery'`

    The name of the queue used by Celery to schedule the library's sync tasks.

* [CLICKHOUSE_SYNC_STORAGE](configuration.md#clickhouse_sync_storage)

    Defaults to: `'django_clickhouse.storages.RedisStorage'`

    An [intermediate storage](storages.md) class to use. Can be a string or class.

* [CLICKHOUSE_SYNC_BATCH_SIZE](configuration.md#clickhouse_sync_batch_size)

    Defaults to: `10000`

    Maximum number of operations, fetched by the sync process from [intermediate storage](storages.md) per sync round.

* [CLICKHOUSE_SYNC_DELAY](configuration.md#clickhouse_sync_delay)

    Defaults to: `5`

    A delay in seconds between the starts of two sync rounds.
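For illustration, these settings combined in `settings.py` (values other than the defaults are arbitrary examples):

```python
# settings.py -- sketch; setting names are the ones documented above.
CLICKHOUSE_CELERY_QUEUE = 'clickhouse'
CLICKHOUSE_SYNC_STORAGE = 'django_clickhouse.storages.RedisStorage'
CLICKHOUSE_SYNC_BATCH_SIZE = 5000
CLICKHOUSE_SYNC_DELAY = 5
```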

### ClickHouseModel configuration

Each `ClickHouseModel` subclass can define sync arguments and methods:

* `django_model: django.db.models.Model`

    Required.
    The Django model this ClickHouseModel class is synchronized with.

* `django_model_serializer: Type[Django2ClickHouseModelSerializer]`

    Defaults to: `django_clickhouse.serializers.Django2ClickHouseModelSerializer`

    [Serializer class](models.md#serializers) to convert DjangoModel to ClickHouseModel.

* `sync_enabled: bool`

    Defaults to: `False`.

    Is sync for this model enabled?

* `sync_batch_size: int`

    Defaults to: [CLICKHOUSE_SYNC_BATCH_SIZE](configuration.md#clickhouse_sync_batch_size)

    Maximum number of operations, fetched by the sync process from [storage](storages.md) per sync round.

* `sync_delay: float`

    Defaults to: [CLICKHOUSE_SYNC_DELAY](configuration.md#clickhouse_sync_delay)

    A delay in seconds between the starts of two sync rounds.

* `sync_storage: Union[str, Storage]`

    Defaults to: [CLICKHOUSE_SYNC_STORAGE](configuration.md#clickhouse_sync_storage)

    An [intermediate storage](storages.md) class to use. Can be a string or class.

Example:

```python
from django_clickhouse.clickhouse_models import ClickHouseModel
from django_clickhouse.engines import ReplacingMergeTree
from infi.clickhouse_orm import fields

from my_app.models import User


class ClickHouseUser(ClickHouseModel):
    django_model = User
    sync_enabled = True
    sync_delay = 5
    sync_batch_size = 1000

    id = fields.UInt32Field()
    first_name = fields.StringField()
    birthday = fields.DateField()
    visits = fields.UInt32Field(default=0)

    engine = ReplacingMergeTree('birthday', ('birthday',))
```


## Fail resistance

Fail resistance is based on several points:
1. [Storage](storages.md) should not lose data in any case. It's not this library's goal to keep it stable.
2. Data is removed from [storage](storages.md) only if the import succeeds. Otherwise the import attempt is repeated.
3. It's recommended to use the ReplacingMergeTree or CollapsingMergeTree [engines](models.md#engines)
   instead of simple MergeTree, so duplicates are removed if a batch is imported twice.
4. Each `ClickHouseModel` is synced in a separate process.
   If one model fails, it should not affect other models.

setup.py

@ -13,7 +13,7 @@ with open('requirements.txt') as f:

setup(
    name='django-clickhouse',
    version='1.0.0',
    packages=['django_clickhouse'],
    package_dir={'': 'src'},
    url='https://github.com/carrotquest/django-clickhouse',

@ -1,3 +1,6 @@

import sys
from unittest import skipIf

from django.test import TestCase

from django_clickhouse.compatibility import namedtuple

@ -10,12 +13,16 @@ class NamedTupleTest(TestCase):

        self.assertTupleEqual((1, 2, 4), tuple(TestTuple(1, 2, 4)))
        self.assertTupleEqual((1, 2, 4), tuple(TestTuple(a=1, b=2, c=4)))

    @skipIf(sys.version_info < (3, 7),
            "On python < 3.7 this error is not raised, as not given defaults are filled by None")
    def test_no_required_value(self):
        TestTuple = namedtuple('TestTuple', ('a', 'b', 'c'), defaults=[3])

        with self.assertRaises(TypeError):
            TestTuple(b=1, c=4)

    def test_duplicate_value(self):
        TestTuple = namedtuple('TestTuple', ('a', 'b', 'c'), defaults=[3])

        with self.assertRaises(TypeError):
            TestTuple(1, 2, 3, c=4)