Added usage examples

This commit is contained in:
Itai Shirav 2020-06-26 17:53:39 +03:00
parent 633c7ee1e9
commit 40a1e21348
14 changed files with 351 additions and 0 deletions

View File

@ -52,4 +52,6 @@ for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percen
print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row))
```
This and other examples can be found in the `examples` folder.
To learn more please visit the [documentation](docs/toc.md).

1
examples/cpu_usage/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/env/

View File

@ -0,0 +1,22 @@
# CPU Usage
This basic example uses `psutil` to collect a simple time-series of per-CPU usage percent. It then prints out some aggregate statistics based on the collected data.
## Running the code
Create a virtualenv and install the required libraries:
```
virtualenv -p python3.6 env
source env/bin/activate
pip install -r requirements.txt
```
Run the `collect` script to populate the database with the CPU statistics. Let it run for a bit before pressing CTRL+C.
```
python collect.py
```
Run the `results` script to display the CPU statistics:
```
python results.py
```

View File

@ -0,0 +1,20 @@
import psutil, time, datetime
from infi.clickhouse_orm import Database
from models import CPUStats
# Connect to the demo database and make sure the statistics table exists
db = Database('demo')
db.create_table(CPUStats)

# Prime psutil - the first sample should be discarded
psutil.cpu_percent(percpu=True)

# Sample the usage of every CPU once a second, until interrupted
while True:
    time.sleep(1)
    stats = psutil.cpu_percent(percpu=True)
    timestamp = datetime.datetime.now()
    print(timestamp)
    # One record per CPU, all sharing the same timestamp
    records = [
        CPUStats(timestamp=timestamp, cpu_id=cpu_id, cpu_percent=cpu_percent)
        for cpu_id, cpu_percent in enumerate(stats)
    ]
    db.insert(records)

View File

@ -0,0 +1,11 @@
from infi.clickhouse_orm import Model, DateTimeField, UInt16Field, Float32Field, Memory
class CPUStats(Model):
    '''
    A single usage measurement of one CPU at one point in time.
    '''

    # The time at which the sample was taken
    timestamp = DateTimeField()
    # Identifies which CPU the sample belongs to
    cpu_id = UInt16Field()
    # The CPU usage, in percent
    cpu_percent = Float32Field()

    # The table uses the Memory engine
    # NOTE(review): data is presumably not persisted across server restarts - confirm
    engine = Memory()

View File

@ -0,0 +1,2 @@
infi.clickhouse_orm
psutil

View File

@ -0,0 +1,13 @@
from infi.clickhouse_orm import Database, F
from models import CPUStats
db = Database('demo')
queryset = CPUStats.objects_in(db)

# Calculate what percentage of the CPU 1 samples were above 95% usage
total = queryset.filter(CPUStats.cpu_id == 1).count()
if total:
    busy = queryset.filter(CPUStats.cpu_id == 1, CPUStats.cpu_percent > 95).count()
    print('CPU 1 was busy {:.2f}% of the time'.format(busy * 100.0 / total))
else:
    # There may be no samples for CPU 1 (e.g. nothing was collected yet, or
    # the machine has a single CPU), so avoid dividing by zero
    print('No samples found for CPU 1')

# Calculate the average usage per CPU
for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percent)):
    print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row))

2
examples/full_text_search/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/ebooks/
/env/

View File

@ -0,0 +1,80 @@
# Full Text Search
This example shows how ClickHouse might be used for searching for word sequences in texts. It's a nice proof of concept, but for production use there are probably better solutions, such as Elasticsearch.
## Running the code
Create a virtualenv and install the required libraries:
```
virtualenv -p python3.6 env
source env/bin/activate
pip install -r requirements.txt
```
Run the `download_ebooks` script to download a dozen classical books from [Project Gutenberg](http://www.gutenberg.org/):
```
python download_ebooks.py
```
Run the `load` script to populate the database with the downloaded texts:
```
python load.py
```
And finally, run the full text search:
```
python search.py "cheshire cat"
```
Asterisks can be used as wildcards (each asterisk stands for one word):
```
python search.py "much * than"
```
## How it works
The `models.py` file defines an ORM model for storing each word in the indexed texts:
```python
class Fragment(Model):
language = LowCardinalityField(StringField(), default='EN')
document = LowCardinalityField(StringField())
idx = UInt64Field()
word = StringField()
stem = StringField()
# An index for faster search by document and fragment idx
index = Index((document, idx), type=Index.minmax(), granularity=1)
# The primary key allows efficient lookup of stems
engine = MergeTree(order_by=(stem, document, idx), partition_key=('language',))
```
The `document` (name) and `idx` (running number of the word inside the document) fields identify the specific word. The `word` field stores the original word as it appears in the text, while the `stem` contains the word after normalization, and that's the field which is used for matching the search terms. Stemming the words makes the matching less strict, so that searching for "swallowed" will also find documents that mention "swallow" or "swallowing".
Here's what some records in the fragment table might look like:
| language | document | idx | word | stem |
|----------|-------------------------|------|------------------|---------------|
| EN | Moby Dick; or The Whale | 4510 | whenever | whenev |
| EN | Moby Dick; or The Whale | 4511 | it | it |
| EN | Moby Dick; or The Whale | 4512 | is | is |
| EN | Moby Dick; or The Whale | 4513 | a | a |
| EN | Moby Dick; or The Whale | 4514 | damp, | damp |
| EN | Moby Dick; or The Whale | 4515 | drizzly | drizzli |
| EN | Moby Dick; or The Whale | 4516 | November | novemb |
| EN | Moby Dick; or The Whale | 4517 | in | in |
| EN | Moby Dick; or The Whale | 4518 | my | my |
| EN | Moby Dick; or The Whale | 4519 | soul; | soul |
Let's say we're looking for the terms "drizzly November". Finding the first word in the sequence (after stemming it) is fast and easy:
```python
query = Fragment.objects_in(db).filter(stem='drizzli').only(Fragment.document, Fragment.idx)
```
We're interested only in the `document` and `idx` fields, since they identify a specific word.
To find the next word in the search terms, we need a subquery similar to the first one, with an additional condition that its index will be one greater than the index of the first word:
```python
subquery = Fragment.objects_in(db).filter(stem='novemb').only(Fragment.document, Fragment.idx)
query = query.filter(F.isIn((Fragment.document, Fragment.idx + 1), subquery))
```
And so on, by adding another subquery for each additional search term we can construct the whole sequence of words.
As for wildcard support, when encountering a wildcard in the search terms we simply skip it - it does not need a subquery (since it can match any word). It only increases the index count so that the query conditions will "skip" one word in the sequence.
The algorithm for building this compound query can be found in the `build_query` function of the `search` script.

View File

@ -0,0 +1,27 @@
import requests
import os
def download_ebook(id):
    '''
    Downloads the text of the ebook with the given id, and saves it
    under the ebooks folder in a file named after the ebook's title.
    An ebook that is not found (HTTP 404) is skipped, any other HTTP
    error raises an exception.
    '''
    print(id, end=' ')
    # Download the ebook's text (with a timeout, so that an unresponsive server can't hang the script)
    r = requests.get('https://www.gutenberg.org/files/{id}/{id}-0.txt'.format(id=id), timeout=60)
    if r.status_code == 404:
        print('NOT FOUND, SKIPPING')
        return
    r.raise_for_status()
    # Find the ebook's title, falling back to the id in case there is no title line
    title = str(id)
    text = r.content.decode('utf-8')
    for line in text.splitlines():
        if line.startswith('Title:'):
            title = line[6:].strip() or title
            # The first title line is the one in the header, no need to scan the rest of the book
            break
    print(title)
    # Save the ebook. Path separators are replaced so that the title is always a valid file name
    filename = title.replace('/', '-').replace('\\', '-')
    with open('ebooks/{}.txt'.format(filename), 'wb') as f:
        f.write(r.content)
if __name__ == "__main__":
    # The downloaded files are saved in this folder
    os.makedirs('ebooks', exist_ok=True)
    # Ids of the ebooks to download
    ebook_ids = (1342, 11, 84, 2701, 25525, 1661, 98, 74, 43, 215, 1400, 76)
    for ebook_id in ebook_ids:
        download_ebook(ebook_id)

View File

@ -0,0 +1,61 @@
import sys
import nltk
from nltk.stem.porter import PorterStemmer
from glob import glob
from infi.clickhouse_orm import Database
from models import Fragment
def trim_punctuation(word):
    '''
    Strips any non-alphanumeric characters from both ends of the word.
    Characters between the first and the last alphanumeric character are
    kept as they are. A word with no alphanumeric characters at all
    becomes an empty string.
    '''
    positions = [pos for pos, char in enumerate(word) if char.isalnum()]
    if not positions:
        return word[len(word):]
    first, last = positions[0], positions[-1]
    return word[first : last + 1]
def parse_file(filename):
    '''
    Reads the UTF-8 text file at the given path, word by word.
    Generates a tuple (original_word, stemmed_word) for each
    whitespace-separated word. The original_word may include punctuation
    characters, which are trimmed before stemming.
    '''
    stem = PorterStemmer().stem
    with open(filename, 'r', encoding='utf-8') as text_file:
        for text_line in text_file:
            for token in text_line.split():
                yield (token, stem(trim_punctuation(token)))
def get_fragments(filename):
    '''
    Generates a Fragment instance for each word in the text file
    at the given path. The document name is the file name without
    its extension, and the words are numbered starting from 1.
    Prints the number of words once the whole file was processed.
    '''
    from os import path
    document, _extension = path.splitext(path.basename(filename))
    count = 0
    for count, (word, stem) in enumerate(parse_file(filename), start=1):
        yield Fragment(document=document, idx=count, word=word, stem=stem)
    print('{} - {} words'.format(filename, count))
if __name__ == '__main__':

    # Make sure the NLTK data is available
    for resource in ('punkt', 'wordnet'):
        nltk.download(resource)

    # Connect to the database and create the table
    db = Database('default')
    db.create_table(Fragment)

    # Files can be given on the command line, the default is everything under ebooks/
    filenames = sys.argv[1:]
    if not filenames:
        filenames = glob('ebooks/*.txt')

    # Insert the fragments of each file in large batches
    for filename in filenames:
        db.insert(get_fragments(filename), batch_size=100000)

View File

@ -0,0 +1,16 @@
from infi.clickhouse_orm import *
class Fragment(Model):
    '''
    A single word inside an indexed document.
    '''

    # The language of the document, 'EN' unless specified otherwise
    language = LowCardinalityField(StringField(), default='EN')
    # The name of the document that contains the word
    document = LowCardinalityField(StringField())
    # The running number of the word inside the document
    idx = UInt64Field()
    # The original word as it appears in the text
    word = StringField()
    # The word after normalization, used for matching search terms
    stem = StringField()

    # An index for faster search by document and fragment idx
    index = Index((document, idx), type=Index.minmax(), granularity=1)

    # The primary key allows efficient lookup of stems
    engine = MergeTree(order_by=(stem, document, idx), partition_key=('language',))

View File

@ -0,0 +1,4 @@
infi.clickhouse_orm
nltk
requests
colorama

View File

@ -0,0 +1,90 @@
import sys
from colorama import init, Fore, Back, Style
from nltk.stem.porter import PorterStemmer
from infi.clickhouse_orm import Database, F
from models import Fragment
from load import trim_punctuation
# The wildcard character. A search term consisting of just this character
# is not stemmed or searched for - it stands for any one word.
WILDCARD = '*'
def prepare_search_terms(text):
    '''
    Splits the text to search into words, and returns the list of
    their stems. Wildcards are kept in the list as they are.
    '''
    stem = PorterStemmer().stem
    return [
        WILDCARD if term == WILDCARD else stem(trim_punctuation(term))
        for term in text.split()
    ]
def build_query(db, stems):
    '''
    Builds a queryset that finds sequences of Fragment instances
    that match the list of stemmed words. Each result identifies
    (by document and idx) the first word of a matching sequence.
    '''
    fragments = Fragment.objects_in(db)
    # The query starts with the occurrences of the first stemmed word
    query = fragments.filter(stem=stems[0]).only(Fragment.document, Fragment.idx)
    # Each of the following words narrows down the query
    for offset, stem in enumerate(stems[1:], start=1):
        # A wildcard matches any word, so it adds no condition
        if stem == WILDCARD:
            continue
        # The word must be found exactly `offset` places after the first word
        occurrences = fragments.filter(stem=stem).only(Fragment.document, Fragment.idx)
        position = (Fragment.document, Fragment.idx + offset)
        query = query.filter(F.isIn(position, occurrences))
    # The results are sorted by their location
    return query.order_by(Fragment.document, Fragment.idx)
def get_matching_text(db, document, from_idx, to_idx, extra=5):
    '''
    Returns the text of the document between the given indexes (inclusive),
    surrounded by `extra` words of context on each side. Green coloring
    starts at the word at from_idx and is reset after the word at to_idx.
    '''
    first_idx = from_idx - extra
    last_idx = to_idx + extra
    conds = (Fragment.document == document) & (Fragment.idx >= first_idx) & (Fragment.idx <= last_idx)
    fragments = Fragment.objects_in(db).filter(conds).order_by('document', 'idx')
    words = []
    for fragment in fragments:
        # Color codes are attached only to the edges of the matching range
        prefix = Fore.GREEN if fragment.idx == from_idx else ''
        suffix = Style.RESET_ALL if fragment.idx == to_idx else ''
        words.append(prefix + fragment.word + suffix)
    return ' '.join(words)
def find(db, text):
    '''
    Searches the database for the given text. Prints the query
    that was built, followed by one line per match with the document
    name and the matching text in its context.
    '''
    stems = prepare_search_terms(text)
    query = build_query(db, stems)
    print('\n' + Fore.MAGENTA + str(query) + Style.RESET_ALL + '\n')
    # Distance between the first and last words of a match
    span = len(stems) - 1
    for match in query:
        snippet = get_matching_text(db, match.document, match.idx, match.idx + span)
        print(Fore.CYAN + match.document + ':' + Style.RESET_ALL, snippet)
if __name__ == '__main__':

    # Set up colorama for colored output
    init()

    # Connect to the database
    db = Database('default')

    # The search text consists of all the command line arguments
    text = ' '.join(sys.argv[1:])
    if text:
        find(db, text)