From 7d2d94336c45564cfaf24d96a9dc7b85008ac5fe Mon Sep 17 00:00:00 2001
From: M1ha
Date: Fri, 7 Feb 2020 13:05:19 +0500
Subject: [PATCH] Added more docs

---
 docs/configuration.md       |   6 +--
 docs/databases.md           |  11 ++--
 docs/index.md               |   2 +-
 docs/models.md              |  31 +++++++++++
 docs/monitoring.md          |   3 +-
 docs/overview.md            |   1 -
 docs/performance.md         |  45 +++++++++++++++-
 docs/synchronization.md     | 104 +++++++++++++++++++++++++++++++++++-
 setup.py                    |   2 +-
 tests/test_compatibility.py |  15 ++++--
 10 files changed, 199 insertions(+), 21 deletions(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 904f764..78c9487 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -38,7 +38,7 @@ A database alias to use in [QuerySets](queries.md) if direct [using](routing.md#
 ### CLICKHOUSE_SYNC_STORAGE
 Defaults to: `'django_clickhouse.storages.RedisStorage'`
-An intermediate storage class to use. Can be a string or class. [More info about storages](storages.md).
+An [intermediate storage](storages.md) class to use. Can be a string or class.
 
 ### CLICKHOUSE_REDIS_CONFIG
 Default to: `None`
@@ -57,11 +57,11 @@ CLICKHOUSE_REDIS_CONFIG = {
 
 ### CLICKHOUSE_SYNC_BATCH_SIZE
 Defaults to: `10000`
-Maximum number of operations, fetched by sync process from intermediate storage per sync round.
+Maximum number of operations fetched by the sync process from [intermediate storage](storages.md) per [sync](synchronization.md) round.
 
 ### CLICKHOUSE_SYNC_DELAY
 Defaults to: `5`
-A delay in seconds between two sync rounds start.
+A delay in seconds between the starts of two [sync](synchronization.md) rounds.
 
 ### CLICKHOUSE_MODELS_MODULE
 Defaults to: `'clickhouse_models'`

diff --git a/docs/databases.md b/docs/databases.md
index c7ec87c..dc1109d 100644
--- a/docs/databases.md
+++ b/docs/databases.md
@@ -22,6 +22,9 @@ secondary = connections['secondary']
 db_link = connections['default']
 ```
 
+You can also get database objects from [QuerySet](queries.md) and [ClickHouseModel](models.md) instances by calling the `get_database(for_write: bool = False)` method.
+The returned database may differ depending on the [routing](routing.md#router) you use.
+
 ## Database object
 Database class is based on [infi.clickhouse_orm Database object](https://github.com/Infinidat/infi.clickhouse_orm/blob/develop/docs/models_and_databases.md#models-and-databases), but extends it with some extra attributes and methods:
 
@@ -31,10 +34,4 @@ I expect this library [migration system](migrations.md) to be used.
 Direct database migration will lead to migration information errors.
 
 ### `insert_tuples` and `select_tuples` methods
-[infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) store data rows in Model objects.
-It works well on hundreds of records.
-But when you sync 100k records in a batch, initializing 100k model instances will be slow.
-Too optimize this process `ClickHouseModel` class have `get_tuple_class()` method.
-It generates a [namedtuple](https://docs.python.org/3/library/collections.html#collections.namedtuple) class,
-with same data fields a model has.
-Initializing such tuples takes much less time, then initializing Model objects.
+Methods to work with [ClickHouseModel namedtuples](models.md#clickhousemodel-namedtuple-form).
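+
+For illustration, a rough sketch of how these methods might be used (the exact signatures,
+import paths and table name here are assumptions; check the library source for details):
+
+```python
+from datetime import date
+
+from django_clickhouse.database import connections
+from my_app.clickhouse_models import ClickHouseUser  # a ClickHouseModel subclass
+
+db = connections['default']
+tuple_class = ClickHouseUser.get_tuple_class()
+
+# Insert plain namedtuples instead of heavy Model instances
+rows = [tuple_class(id=1, first_name='Alice', birthday=date(1990, 5, 1), visits=0)]
+db.insert_tuples(ClickHouseUser, rows)
+
+# Read rows back as lightweight namedtuples
+for row in db.select_tuples('SELECT * FROM $db.clickhouseuser', ClickHouseUser):
+    print(row.id, row.first_name)
+```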
diff --git a/docs/index.md b/docs/index.md
index 5c364a7..8aedf21 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -6,7 +6,7 @@
 * [Requirements](basic_information.md#requirements)
 * [Installation](basic_information.md#installation)
 * [Design motivation](motivation.md)
-* Usage
+* [Usage](overview.md)
 * [Overview](overview.md)
 * [Models](models.md)
 * [DjangoModel](models.md#DjangoModel)

diff --git a/docs/models.md b/docs/models.md
index 86d6415..95e4435 100644
--- a/docs/models.md
+++ b/docs/models.md
@@ -109,6 +109,15 @@ class MyMultiModel(ClickHouseMultiModel):
     sub_models = [AgeData, HeightData]
 ```
 
+## ClickHouseModel namedtuple form
+[infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) stores data rows in special Model objects.
+It works well on hundreds of records.
+But when you sync 100k records in a batch, initializing 100k model instances will be slow.
+To optimize this process, the `ClickHouseModel` class has a `get_tuple_class()` method.
+It generates a [namedtuple](https://docs.python.org/3/library/collections.html#collections.namedtuple) class
+with the same data fields as the model.
+Initializing such tuples takes much less time than initializing Model objects.
+
 ## Engines
 Engine is a way of storing, indexing, replicating and sorting data ClickHouse ([docs](https://clickhouse.yandex/docs/en/operations/table_engines/)).
 Engine system is based on [infi.clickhouse_orm engine system](https://github.com/Infinidat/infi.clickhouse_orm/blob/develop/docs/table_engines.md#table-engines).
@@ -120,3 +129,25 @@ Currently supported engines (with all infi functionality, [more info](https://gi
 * `ReplacingMergeTree`
 * `SummingMergeTree`
 * `CollapsingMergeTree`
+
+
+## Serializers
+A serializer is a class which translates Django model instances to [namedtuples inserted into ClickHouse](#clickhousemodel-namedtuple-form).
+`django_clickhouse.serializers.Django2ClickHouseModelSerializer` is used by default in all models.
+All serializers must inherit this class.
+
+A serializer must implement the following interface:
+```python
+from django_clickhouse.serializers import Django2ClickHouseModelSerializer
+from django.db.models import Model as DjangoModel
+from typing import *
+
+class CustomSerializer(Django2ClickHouseModelSerializer):
+    def __init__(self, model_cls: Type['ClickHouseModel'], fields: Optional[Iterable[str]] = None,
+                 exclude_fields: Optional[Iterable[str]] = None, writable: bool = False,
+                 defaults: Optional[dict] = None) -> None:
+        super().__init__(model_cls, fields=fields, exclude_fields=exclude_fields, writable=writable, defaults=defaults)
+
+    def serialize(self, obj: DjangoModel) -> NamedTuple:
+        pass
+```
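+
+For example, a custom serializer might be attached to a model through the `django_model_serializer`
+attribute described in [synchronization](synchronization.md#configuration) (the `UserSerializer`
+class here is hypothetical):
+
+```python
+from django_clickhouse.clickhouse_models import ClickHouseModel
+from my_app.models import User
+from my_app.serializers import UserSerializer  # your Django2ClickHouseModelSerializer subclass
+
+class ClickHouseUser(ClickHouseModel):
+    django_model = User
+    django_model_serializer = UserSerializer
+```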
diff --git a/docs/monitoring.md b/docs/monitoring.md
index fadac7a..8567813 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -11,8 +11,7 @@ You can set a common prefix for all keys in this library using [CLICKHOUSE_STATS
 ## Gauges
 * `<prefix>.sync.<model>.queue`
   Number of elements in [intermediate storage](storages.md) queue waiting for import.
-
-  Queue should not be big. It depends on [sync_delay]() configured and time for syncing single batch.
+  The queue should not be big. Its size depends on the configured [sync_delay](synchronization.md#configuration) and the time needed to sync a single batch.
   It is a good parameter to watch and alert on.
 
 ## Timers

diff --git a/docs/overview.md b/docs/overview.md
index 8ea3f3f..2054067 100644
--- a/docs/overview.md
+++ b/docs/overview.md
@@ -76,7 +76,6 @@ from my_app.models import User
 
 class ClickHouseUser(ClickHouseModel):
     django_model = User
-    sync_delay = 5
 
     id = fields.UInt32Field()
     first_name = fields.StringField()

diff --git a/docs/performance.md b/docs/performance.md
index ba33ada..1efa1aa 100644
--- a/docs/performance.md
+++ b/docs/performance.md
@@ -1,3 +1,46 @@
 # Sync performance
+Every real-life system may have its own performance problems.
+They depend on:
+* Your ClickHouse server configuration
+* The number of ClickHouse instances in your cluster
+* Your data formats
+* Import speed
+* Network
+* etc.
 
-TODO
\ No newline at end of file
+I recommend using [monitoring](monitoring.md) in order to understand where the bottleneck is and act accordingly.
+
+This chapter gives a list of known problems which can slow down your import.
+
+## ClickHouse tuning
+Read this [doc](https://clickhouse.tech/docs/en/introduction/performance/#performance-when-inserting-data)
+ and tune it both for read and write.
+
+## ClickHouse cluster
+As ClickHouse is a [multimaster database](https://clickhouse.tech/docs/en/introduction/distinctive_features/#data-replication-and-data-integrity-support),
+ you can import to and read from any node when you have a cluster.
+In order to read from and import to multiple nodes you can use [CHProxy](https://github.com/Vertamedia/chproxy)
+or add multiple databases to the [routing configuration](routing.md#clickhousemodel-routing-attributes).
+
+## CollapsingMergeTree engine and previous versions
+In order to reduce the amount of data kept in [intermediate storage](storages.md),
+ this library doesn't store old versions of data on update or delete.
+Another reason is that getting previous data versions from relational storages is an expensive operation.
+Engines like `CollapsingMergeTree` therefore get old versions from ClickHouse itself:
+1. Using `version_col` if it is set in the engine's parameters.
+   This is a special field which stores incremental row versions and is filled by the library.
+   It can be of any unsigned integer type (depending on how many row versions you may have).
+2. Using the `FINAL` query modifier.
+   This way is much slower, but doesn't require an additional column.
+
+## Know your data
+In the common case, the library user works with Python types, and the library
+is responsible for converting this data into the format ClickHouse expects to receive.
+This leads to a great number of conversion operations when you import data in big batches.
+In order to reduce this time, you can:
+* Set `MyClickHouseModel.sync_formatted_tuples` to `True`
+* Override the `MyClickHouseModel.get_insert_batch(cls, import_objects: Iterable[DjangoModel])` class method:
+  it should call `cls.get_tuple_class()` and yield tuples of string values (it is a [generator](https://wiki.python.org/moin/Generators)),
+  already prepared for insertion into ClickHouse, as in the sketch below.
+  **Important note**: `ClickHouseModel.get_insert_batch(...)` can perform additional functionality depending on the model [engine](models.md#engines).
+  Be careful.
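+
+A minimal sketch of such an override (the field set and formatting logic below are illustrative,
+not the library's exact internals):
+
+```python
+from typing import Iterable, Iterator
+
+from django.db.models import Model as DjangoModel
+
+from django_clickhouse.clickhouse_models import ClickHouseModel
+
+class MyClickHouseModel(ClickHouseModel):
+    sync_formatted_tuples = True
+
+    @classmethod
+    def get_insert_batch(cls, import_objects: Iterable[DjangoModel]) -> Iterator[tuple]:
+        tuple_class = cls.get_tuple_class()
+        for obj in import_objects:
+            # Yield values already formatted as strings, so the library can
+            # skip per-field conversion at insert time
+            yield tuple_class(id=str(obj.id), first_name=obj.first_name)
+```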
diff --git a/docs/synchronization.md b/docs/synchronization.md
index 3acc749..d147232 100644
--- a/docs/synchronization.md
+++ b/docs/synchronization.md
@@ -1,3 +1,105 @@
 # Synchronization
 
-TODO
\ No newline at end of file
+## Design motivation
+Read [here](motivation.md#sync-over-intermediate-storage).
+
+
+## Algorithm
+
+1. [Celery beat](https://docs.celeryproject.org/en/latest/userguide/periodic-tasks.html) schedules the `django_clickhouse.tasks.clickhouse_auto_sync` task every second or so (see the sketch after this list).
+2. [Celery workers](https://docs.celeryproject.org/en/latest/userguide/workers.html) execute `clickhouse_auto_sync`.
+   It searches for `ClickHouseModel` subclasses which need sync (those whose `need_sync()` method returns `True`).
+3. A `django_clickhouse.tasks.sync_clickhouse_model` task is scheduled for each `ClickHouseModel` which needs sync.
+4. `sync_clickhouse_model` saves the sync start time in [storage](storages.md) and calls the `ClickHouseModel.sync_batch_from_storage()` method.
+5. `ClickHouseModel.sync_batch_from_storage()`:
+   * Gets the [storage](storages.md) the model works with, using the `ClickHouseModel.get_storage()` method
+   * Calls `Storage.pre_sync(import_key)` for the model's [storage](storages.md).
+     This may be used to prevent parallel execution with locks or for some other operations.
+   * Gets a list of operations to sync from the [storage](storages.md).
+   * Fetches objects from the relational database by calling the `ClickHouseModel.get_sync_objects(operations)` method.
+   * Forms a batch of tuples to insert into ClickHouse using the `ClickHouseModel.get_insert_batch(import_objects)` method.
+   * Inserts the batch of tuples into ClickHouse using the `ClickHouseModel.insert_batch(batch)` method.
+   * Calls the `Storage.post_sync(import_key)` method to clean up the storage after syncing the batch.
+     This method also removes synced operations from the storage.
+   * If some exception occurs during execution, the `Storage.post_sync_failed(import_key)` method is called.
+     Note that the process can be killed without an exception (for instance, by the OOM killer),
+     in which case this method will not be called.
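+
+A minimal sketch of the celery beat schedule for this task (the interval and queue name below are
+assumptions; tune them to your project):
+
+```python
+# settings.py
+CELERYBEAT_SCHEDULE = {
+    'clickhouse_auto_sync': {
+        'task': 'django_clickhouse.tasks.clickhouse_auto_sync',
+        'schedule': 2.0,  # seconds between runs
+        'options': {'queue': 'celery'}  # matches CLICKHOUSE_CELERY_QUEUE
+    }
+}
+```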
+
+
+## Configuration
+Sync configuration can be set globally using django settings.py parameters or redeclared for each `ClickHouseModel` class.
+`ClickHouseModel` configuration takes priority over the settings configuration.
+
+### Settings configuration
+* [CLICKHOUSE_CELERY_QUEUE](configuration.md#clickhouse_celery_queue)
+Defaults to: `'celery'`
+A name of the queue used by celery to plan the library's sync tasks.
+
+* [CLICKHOUSE_SYNC_STORAGE](configuration.md#clickhouse_sync_storage)
+Defaults to: `'django_clickhouse.storages.RedisStorage'`
+An [intermediate storage](storages.md) class to use. Can be a string or class.
+
+* [CLICKHOUSE_SYNC_BATCH_SIZE](configuration.md#clickhouse_sync_batch_size)
+Defaults to: `10000`
+Maximum number of operations fetched by the sync process from [intermediate storage](storages.md) per sync round.
+
+* [CLICKHOUSE_SYNC_DELAY](configuration.md#clickhouse_sync_delay)
+Defaults to: `5`
+A delay in seconds between the starts of two sync rounds.
+
+### ClickHouseModel configuration
+Each `ClickHouseModel` subclass can define sync arguments and methods:
+* `django_model: django.db.models.Model`
+Required.
+The Django model this ClickHouseModel class is synchronized with.
+
+* `django_model_serializer: Type[Django2ClickHouseModelSerializer]`
+Defaults to: `django_clickhouse.serializers.Django2ClickHouseModelSerializer`
+The [serializer class](models.md#serializers) used to convert a DjangoModel to this ClickHouseModel.
+
+* `sync_enabled: bool`
+Defaults to: `False`.
+Is sync for this model enabled?
+
+* `sync_batch_size: int`
+Defaults to: [CLICKHOUSE_SYNC_BATCH_SIZE](configuration.md#clickhouse_sync_batch_size)
+Maximum number of operations fetched by the sync process from [storage](storages.md) per sync round.
+
+* `sync_delay: float`
+Defaults to: [CLICKHOUSE_SYNC_DELAY](configuration.md#clickhouse_sync_delay)
+A delay in seconds between the starts of two sync rounds.
+
+* `sync_storage: Union[str, Storage]`
+Defaults to: [CLICKHOUSE_SYNC_STORAGE](configuration.md#clickhouse_sync_storage)
+An [intermediate storage](storages.md) class to use. Can be a string or class.
+
+Example:
+```python
+from django_clickhouse.clickhouse_models import ClickHouseModel
+from django_clickhouse.engines import ReplacingMergeTree
+from infi.clickhouse_orm import fields
+from my_app.models import User
+
+class ClickHouseUser(ClickHouseModel):
+    django_model = User
+    sync_enabled = True
+    sync_delay = 5
+    sync_batch_size = 1000
+
+    id = fields.UInt32Field()
+    first_name = fields.StringField()
+    birthday = fields.DateField()
+    visits = fields.UInt32Field(default=0)
+
+    engine = ReplacingMergeTree('birthday', ('birthday',))
+```
+
+
+## Fail resistance
+Fail resistance is based on several points:
+1. [Storage](storages.md) should not lose data in any case. Keeping the storage itself stable is not this library's goal.
+2. Data is removed from [storage](storages.md) only if the import succeeds. Otherwise the import attempt is repeated.
+3. It's recommended to use the ReplacingMergeTree or CollapsingMergeTree [engines](models.md#engines)
+   instead of simple MergeTree, so duplicates are removed if a batch is imported twice.
+4. Each `ClickHouseModel` is synced in a separate process.
+   If one model fails, it should not affect other models.

diff --git a/setup.py b/setup.py
index c20a379..102de34 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ with open('requirements.txt') as f:
 
 setup(
     name='django-clickhouse',
-    version='0.0.1',
+    version='1.0.0',
     packages=['django_clickhouse'],
     package_dir={'': 'src'},
     url='https://github.com/carrotquest/django-clickhouse',

diff --git a/tests/test_compatibility.py b/tests/test_compatibility.py
index 0c0cd30..c369277 100644
--- a/tests/test_compatibility.py
+++ b/tests/test_compatibility.py
@@ -1,3 +1,6 @@
+import sys
+from unittest import skipIf
+
 from django.test import TestCase
 
 from django_clickhouse.compatibility import namedtuple
@@ -10,12 +13,16 @@ class NamedTupleTest(TestCase):
         self.assertTupleEqual((1, 2, 4), tuple(TestTuple(1, 2, 4)))
         self.assertTupleEqual((1, 2, 4), tuple(TestTuple(a=1, b=2, c=4)))
 
-    def test_exceptions(self):
+    @skipIf(sys.version_info < (3, 7),
+            "On python < 3.7 this error is not raised, as not given defaults are filled by None")
+    def test_no_required_value(self):
         TestTuple = namedtuple('TestTuple', ('a', 'b', 'c'), defaults=[3])
 
-        # BUG On python < 3.7 this error is not raised, as not given defaults are filled by None
-        # with self.assertRaises(TypeError):
-        #     TestTuple(b=1, c=4)
+        with self.assertRaises(TypeError):
+            TestTuple(b=1, c=4)
+
+    def test_duplicate_value(self):
+        TestTuple = namedtuple('TestTuple', ('a', 'b', 'c'), defaults=[3])
 
         with self.assertRaises(TypeError):
             TestTuple(1, 2, 3, c=4)