@ -52,4 +52,6 @@ for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percen
print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row))
This and other examples can be found in the `examples` folder.
To learn more please visit the [documentation](docs/

@ -0,0 +1 @@

@ -0,0 +1,22 @@
# CPU Usage
This basic example uses `psutil` to collect a simple time-series of per-CPU usage percent. It then prints out some aggregate statistics based on the collected data.
## Running the code
Create a virtualenv and install the required libraries:
virtualenv -p python3.6 env
source env/bin/activate
pip install -r requirements.txt
Run the `collect` script to populate the database with the CPU statistics. Let it run for a bit before pressing CTRL+C.
Run the `results` script to display the CPU statistics:

@ -0,0 +1,20 @@
import psutil, time, datetime
from infi.clickhouse_orm import Database
from models import CPUStats
db = Database('demo')
psutil.cpu_percent(percpu=True) # first sample should be discarded
while True:
stats = psutil.cpu_percent(percpu=True)
timestamp =
CPUStats(timestamp=timestamp, cpu_id=cpu_id, cpu_percent=cpu_percent)
for cpu_id, cpu_percent in enumerate(stats)

@ -0,0 +1,11 @@
from infi.clickhouse_orm import Model, DateTimeField, UInt16Field, Float32Field, Memory
class CPUStats(Model):
timestamp = DateTimeField()
cpu_id = UInt16Field()
cpu_percent = Float32Field()
engine = Memory()

@ -0,0 +1,2 @@

@ -0,0 +1,13 @@
from infi.clickhouse_orm import Database, F
from models import CPUStats
db = Database('demo')
queryset = CPUStats.objects_in(db)
total = queryset.filter(CPUStats.cpu_id == 1).count()
busy = queryset.filter(CPUStats.cpu_id == 1, CPUStats.cpu_percent > 95).count()
print('CPU 1 was busy {:.2f}% of the time'.format(busy * 100.0 / total))
# Calculate the average usage per CPU
for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percent)):
print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row))

@ -0,0 +1,2 @@

@ -0,0 +1,80 @@
# Full Text Search
This example shows how ClickHouse might be used for searching for word sequences in texts. It's a nice proof of concept, but for production use there are probably better solutions, such as Elasticsearch.
## Running the code
Create a virtualenv and install the required libraries:
virtualenv -p python3.6 env
source env/bin/activate
pip install -r requirements.txt
Run the `download_ebooks` script to download a dozen classical books from [The Gutenberg Project](
Run the `load` script to populate the database with the downloaded texts:
And finally, run the full text search:
python "cheshire cat"
Asterisks can be used as wildcards (each asterisk stands for one word):
python "much * than"
## How it works
The `` file defines an ORM model for storing each word in the indexed texts:
class Fragment(Model):
language = LowCardinalityField(StringField(default='EN'))
document = LowCardinalityField(StringField())
idx = UInt64Field()
word = StringField()
stem = StringField()
# An index for faster search by document and fragment idx
index = Index((document, idx), type=Index.minmax(), granularity=1)
# The primary key allows efficient lookup of stems
engine = MergeTree(order_by=(stem, document, idx), partition_key=('language',))
The `document` (name) and `idx` (running number of the word inside the document) fields identify the specific word. The `word` field stores the original word as it appears in the text, while the `stem` contains the word after normalization, and that's the field which is used for matching the search terms. Stemming the words makes the matching less strict, so that searching for "swallowed" will also find documents that mention "swallow" or "swallowing".
Here's what some records in the fragment table might look like:
| language | document | idx | word | stem |
| EN | Moby Dick; or The Whale | 4510 | whenever | whenev |
| EN | Moby Dick; or The Whale | 4511 | it | it |
| EN | Moby Dick; or The Whale | 4512 | is | is |
| EN | Moby Dick; or The Whale | 4513 | a | a |
| EN | Moby Dick; or The Whale | 4514 | damp, | damp |
| EN | Moby Dick; or The Whale | 4515 | drizzly | drizzli |
| EN | Moby Dick; or The Whale | 4516 | November | novemb |
| EN | Moby Dick; or The Whale | 4517 | in | in |
| EN | Moby Dick; or The Whale | 4518 | my | my |
| EN | Moby Dick; or The Whale | 4519 | soul; | soul |
Let's say we're looking for the terms "drizzly November". Finding the first in the sequence (after stemming it) is fast and easy:
query = Fragment.objects_in(db).filter(stem='drizzli').only(Fragment.document, Fragment.idx)
We're interested only in the `document` and `idx` fields, since they identify a specific word.
To find the next word in the search terms, we need a subquery similar to the first one, with an additional condition that its index will be one greater than the index of the first word:
subquery = Fragment.objects_in(db).filter(stem='novemb').only(Fragment.document, Fragment.idx)
query = query.filter(F.isIn((Fragment.document, Fragment.idx + 1), subquery))
And so on, by adding another subquery for each additional search term we can construct the whole sequence of words.
As for wildcard support, when encountering a wildcard in the search terms we simply skip it - it does not need a subquery (since it can match any word). It only increases the index count so that the query conditions will "skip" one word in the sequence.
The algorithm for building this compound query can be found in the `build_query` function.

@ -0,0 +1,27 @@
import requests
import os
def download_ebook(id):
print(id, end=' ')
# Download the ebook's text
r = requests.get('{id}/{id}-0.txt'.format(id=id))
if r.status_code == 404:
# Find the ebook's title
text = r.content.decode('utf-8')
for line in text.splitlines():
if line.startswith('Title:'):
title = line[6:].strip()
# Save the ebook
with open('ebooks/{}.txt'.format(title), 'wb') as f:
if __name__ == "__main__":
os.makedirs('ebooks', exist_ok=True)
for i in [1342, 11, 84, 2701, 25525, 1661, 98, 74, 43, 215, 1400, 76]:

@ -0,0 +1,61 @@
import sys
import nltk
from nltk.stem.porter import PorterStemmer
from glob import glob
from infi.clickhouse_orm import Database
from models import Fragment
def trim_punctuation(word):
Trim punctuation characters from the beginning and end of the word
start = end = len(word)
for i in range(len(word)):
if word[i].isalnum():
start = min(start, i)
end = i + 1
return word[start : end]
def parse_file(filename):
Parses a text file at the give path.
Returns a generator of tuples (original_word, stemmed_word)
The original_word may include punctuation characters.
stemmer = PorterStemmer()
with open(filename, 'r', encoding='utf-8') as f:
for line in f:
for word in line.split():
yield (word, stemmer.stem(trim_punctuation(word)))
def get_fragments(filename):
Converts a text file at the given path to a generator
of Fragment instances.
from os import path
document = path.splitext(path.basename(filename))[0]
idx = 0
for word, stem in parse_file(filename):
idx += 1
yield Fragment(document=document, idx=idx, word=word, stem=stem)
print('{} - {} words'.format(filename, idx))
if __name__ == '__main__':
# Load NLTK data if necessary'punkt')'wordnet')
# Initialize database
db = Database('default')
# Load files from the command line or everything under ebooks/
filenames = sys.argv[1:] or glob('ebooks/*.txt')
for filename in filenames:
db.insert(get_fragments(filename), batch_size=100000)

@ -0,0 +1,16 @@
from infi.clickhouse_orm import *
class Fragment(Model):
language = LowCardinalityField(StringField(), default='EN')
document = LowCardinalityField(StringField())
idx = UInt64Field()
word = StringField()
stem = StringField()
# An index for faster search by document and fragment idx
index = Index((document, idx), type=Index.minmax(), granularity=1)
# The primary key allows efficient lookup of stems
engine = MergeTree(order_by=(stem, document, idx), partition_key=('language',))

@ -0,0 +1,4 @@

@ -0,0 +1,90 @@
import sys
from colorama import init, Fore, Back, Style
from nltk.stem.porter import PorterStemmer
from infi.clickhouse_orm import Database, F
from models import Fragment
from load import trim_punctuation
# The wildcard character
def prepare_search_terms(text):
Convert the text to search into a list of stemmed words.
stemmer = PorterStemmer()
stems = []
for word in text.split():
if word == WILDCARD:
return stems
def build_query(db, stems):
Returns a queryset instance for finding sequences of Fragment instances
that matche the list of stemmed words.
# Start by searching for the first stemmed word
all_fragments = Fragment.objects_in(db)
query = all_fragments.filter(stem=stems[0]).only(Fragment.document, Fragment.idx)
# Add the following words to the queryset
for i, stem in enumerate(stems):
# Skip the first word (it's already in the query), and wildcards
if i == 0 or stem == WILDCARD:
# Create a subquery that finds instances of the i'th word
subquery = all_fragments.filter(stem=stem).only(Fragment.document, Fragment.idx)
# Add it to the query, requiring that it will appear i places away from the first word
query = query.filter(F.isIn((Fragment.document, Fragment.idx + i), subquery))
# Sort the results
query = query.order_by(Fragment.document, Fragment.idx)
return query
def get_matching_text(db, document, from_idx, to_idx, extra=5):
Reconstructs the document text between the given indexes (inclusive),
plus `extra` words before and after the match. The words that are
included in the given range are highlighted in green.
text = []
conds = (Fragment.document == document) & (Fragment.idx >= from_idx - extra) & (Fragment.idx <= to_idx + extra)
for fragment in Fragment.objects_in(db).filter(conds).order_by('document', 'idx'):
word = fragment.word
if fragment.idx == from_idx:
word = Fore.GREEN + word
if fragment.idx == to_idx:
word = word + Style.RESET_ALL
return ' '.join(text)
def find(db, text):
Performs the search for the given text, and prints out the matches.
stems = prepare_search_terms(text)
query = build_query(db, stems)
print('\n' + Fore.MAGENTA + str(query) + Style.RESET_ALL + '\n')
for match in query:
text = get_matching_text(db, match.document, match.idx, match.idx + len(stems) - 1)
print(Fore.CYAN + match.document + ':' + Style.RESET_ALL, text)
if __name__ == '__main__':
# Initialize colored output
# Initialize database
db = Database('default')
# Search
text = ' '.join(sys.argv[1:])
if text:
find(db, text)