Added usage examples

2025-11-09 20:27:31 +03:00 · 2020-06-26 17:53:39 +03:00 · 2020-06-26 17:53:39 +03:00 · 40a1e21348
commit 40a1e21348
parent 633c7ee1e9
14 changed files with 351 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -52,4 +52,6 @@ for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percen
    print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row))
 ```
 This and other examples can be found in the `examples` folder.
 To learn more please visit the [documentation](docs/toc.md).
--- a/examples/cpu_usage/.gitignore
+++ b/examples/cpu_usage/.gitignore
@ -0,0 +1 @@
 /env/
--- a/examples/cpu_usage/README.md
+++ b/examples/cpu_usage/README.md
@ -0,0 +1,22 @@
 # CPU Usage
 This basic example uses `psutil` to collect a simple time-series of per-CPU usage percent. It then prints out some aggregate statistics based on the collected data.
 ## Running the code
 Create a virtualenv and install the required libraries:
 ```
 virtualenv -p python3.6 env
 source env/bin/activate
 pip install -r requirements.txt
 ```
 Run the `collect` script to populate the database with the CPU statistics. Let it run for a bit before pressing CTRL+C.
 ```
 python collect.py
 ```
 Run the `results` script to display the CPU statistics:
 ```
 python results.py
 ```
--- a/examples/cpu_usage/collect.py
+++ b/examples/cpu_usage/collect.py
@ -0,0 +1,20 @@
 import psutil, time, datetime
 from infi.clickhouse_orm import Database
 from models import CPUStats
 # Connect to the demo database and make sure the table for the samples exists
 db = Database('demo')
 db.create_table(CPUStats)
 # The result of the first measurement is deliberately thrown away
 psutil.cpu_percent(percpu=True)
 # Take one sample per second until the script is interrupted
 while True:
    time.sleep(1)
    stats = psutil.cpu_percent(percpu=True)
    timestamp = datetime.datetime.now()
    print(timestamp)
    # One row per CPU; the CPU's position in the list serves as its id
    samples = [
        CPUStats(timestamp=timestamp, cpu_id=cpu_id, cpu_percent=cpu_percent)
        for cpu_id, cpu_percent in enumerate(stats)
    ]
    db.insert(samples)
--- a/examples/cpu_usage/models.py
+++ b/examples/cpu_usage/models.py
@ -0,0 +1,11 @@
 from infi.clickhouse_orm import Model, DateTimeField, UInt16Field, Float32Field, Memory
 class CPUStats(Model):
    '''
    A single usage sample of one CPU at one point in time.
    '''
    # When the sample was taken
    timestamp = DateTimeField()
    # Identifies the CPU that the sample belongs to
    cpu_id = UInt16Field()
    # The usage percent measured for that CPU
    cpu_percent = Float32Field()
    # The table uses the Memory engine
    engine = Memory()
--- a/examples/cpu_usage/requirements.txt
+++ b/examples/cpu_usage/requirements.txt
@ -0,0 +1,2 @@
 infi.clickhouse_orm
 psutil
--- a/examples/cpu_usage/results.py
+++ b/examples/cpu_usage/results.py
@ -0,0 +1,13 @@
 from infi.clickhouse_orm import Database, F
 from models import CPUStats
 db = Database('demo')
 queryset = CPUStats.objects_in(db)
 # Calculate what percentage of the samples show CPU 1 as busy (over 95% usage)
 total = queryset.filter(CPUStats.cpu_id == 1).count()
 if total:
    busy = queryset.filter(CPUStats.cpu_id == 1, CPUStats.cpu_percent > 95).count()
    print('CPU 1 was busy {:.2f}% of the time'.format(busy * 100.0 / total))
 else:
    # Nothing was collected for CPU 1 (e.g. the table is empty), so there is
    # no ratio to compute - report it instead of dividing by zero
    print('No samples found for CPU 1')
 # Calculate the average usage per CPU
 for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percent)):
    print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row))
--- a/examples/full_text_search/.gitignore
+++ b/examples/full_text_search/.gitignore
@ -0,0 +1,2 @@
 /ebooks/
 /env/
--- a/examples/full_text_search/README.md
+++ b/examples/full_text_search/README.md
@ -0,0 +1,80 @@
 # Full Text Search
 This example shows how ClickHouse might be used for searching for word sequences in texts. It's a nice proof of concept, but for production use there are probably better solutions, such as Elasticsearch.
 ## Running the code
 Create a virtualenv and install the required libraries:
 ```
 virtualenv -p python3.6 env
 source env/bin/activate
 pip install -r requirements.txt
 ```
 Run the `download_ebooks` script to download a dozen classic books from [Project Gutenberg](http://www.gutenberg.org/):
 ```
 python download_ebooks.py
 ```
 Run the `load` script to populate the database with the downloaded texts:
 ```
 python load.py
 ```
 And finally, run the full text search:
 ```
 python search.py "cheshire cat"
 ```
 Asterisks can be used as wildcards (each asterisk stands for one word):
 ```
 python search.py "much * than"
 ```
 ## How it works
 The `models.py` file defines an ORM model for storing each word in the indexed texts:
 ```python
 class Fragment(Model):
    language = LowCardinalityField(StringField(), default='EN')
    document = LowCardinalityField(StringField())
    idx      = UInt64Field()
    word     = StringField()
    stem     = StringField()
    # An index for faster search by document and fragment idx
    index    = Index((document, idx), type=Index.minmax(), granularity=1)
    # The primary key allows efficient lookup of stems
    engine   = MergeTree(order_by=(stem, document, idx), partition_key=('language',))
 ```
 The `document` (name) and `idx` (running number of the word inside the document) fields identify the specific word. The `word` field stores the original word as it appears in the text, while the `stem` contains the word after normalization, and that's the field which is used for matching the search terms. Stemming the words makes the matching less strict, so that searching for "swallowed" will also find documents that mention "swallow" or "swallowing".
 Here's what some records in the fragment table might look like:
 | language | document                | idx  | word             | stem          |
 |----------|-------------------------|------|------------------|---------------|
 | EN       | Moby Dick; or The Whale | 4510 | whenever         | whenev        |
 | EN       | Moby Dick; or The Whale | 4511 | it               | it            |
 | EN       | Moby Dick; or The Whale | 4512 | is               | is            |
 | EN       | Moby Dick; or The Whale | 4513 | a                | a             |
 | EN       | Moby Dick; or The Whale | 4514 | damp,            | damp          |
 | EN       | Moby Dick; or The Whale | 4515 | drizzly          | drizzli       |
 | EN       | Moby Dick; or The Whale | 4516 | November         | novemb        |
 | EN       | Moby Dick; or The Whale | 4517 | in               | in            |
 | EN       | Moby Dick; or The Whale | 4518 | my               | my            |
 | EN       | Moby Dick; or The Whale | 4519 | soul;            | soul          |
 Let's say we're looking for the terms "drizzly November". Finding the first in the sequence (after stemming it) is fast and easy:
 ```python
 query = Fragment.objects_in(db).filter(stem='drizzli').only(Fragment.document, Fragment.idx)
 ```
 We're interested only in the `document` and `idx` fields, since they identify a specific word.
 To find the next word in the search terms, we need a subquery similar to the first one, with an additional condition that its index will be one greater than the index of the first word:
 ```python
 subquery = Fragment.objects_in(db).filter(stem='novemb').only(Fragment.document, Fragment.idx)
 query = query.filter(F.isIn((Fragment.document, Fragment.idx + 1), subquery))
 ```
 And so on, by adding another subquery for each additional search term we can construct the whole sequence of words.
 As for wildcard support, when encountering a wildcard in the search terms we simply skip it - it does not need a subquery (since it can match any word). It only increases the index count so that the query conditions will "skip" one word in the sequence.
 The algorithm for building this compound query can be found in the `build_query` function.
--- a/examples/full_text_search/download_ebooks.py
+++ b/examples/full_text_search/download_ebooks.py
@ -0,0 +1,27 @@
 import requests
 import os
 def download_ebook(id):
    '''
    Downloads the text of the ebook with the given id from gutenberg.org
    and saves it as ebooks/<title>.txt (the ebooks directory must exist).
    The title is taken from the first line that starts with "Title:";
    if there is no such line, the id is used as the title instead.
    Ebooks that are not found (HTTP 404) are skipped, any other HTTP
    error raises an exception.
    '''
    print(id, end=' ')
    # Download the ebook's text (with a timeout, so that an unresponsive
    # server cannot block the script forever)
    r = requests.get('https://www.gutenberg.org/files/{id}/{id}-0.txt'.format(id=id), timeout=60)
    if r.status_code == 404:
        print('NOT FOUND, SKIPPING')
        return
    r.raise_for_status()
    # Find the ebook's title. Only the first "Title:" line counts, so that
    # a similar line inside the book itself cannot override it.
    text = r.content.decode('utf-8')
    title = str(id)
    for line in text.splitlines():
        if line.startswith('Title:'):
            title = line[6:].strip() or title
            break
    print(title)
    # Save the ebook. Path separators inside the title are replaced,
    # otherwise they would be interpreted as directories.
    filename = title.replace('/', '-').replace('\\', '-')
    with open('ebooks/{}.txt'.format(filename), 'wb') as f:
        f.write(r.content)
 if __name__ == "__main__":
    # The ids of the ebooks to download
    ebook_ids = [1342, 11, 84, 2701, 25525, 1661, 98, 74, 43, 215, 1400, 76]
    # The target directory must exist before anything is saved into it
    os.makedirs('ebooks', exist_ok=True)
    for ebook_id in ebook_ids:
        download_ebook(ebook_id)
--- a/examples/full_text_search/load.py
+++ b/examples/full_text_search/load.py
@ -0,0 +1,61 @@
 import sys
 import nltk
 from nltk.stem.porter import PorterStemmer
 from glob import glob
 from infi.clickhouse_orm import Database
 from models import Fragment
 def trim_punctuation(word):
    '''
    Returns the part of the word between its first and last alphanumeric
    characters (inclusive). Anything before or after them is dropped, while
    punctuation in the middle of the word is preserved.
    A word without any alphanumeric characters yields an empty string.
    '''
    positions = [pos for pos, char in enumerate(word) if char.isalnum()]
    if not positions:
        return ''
    return word[positions[0] : positions[-1] + 1]
 def parse_file(filename):
    '''
    Reads the UTF-8 text file at the given path, word by word.
    Returns a generator of tuples (original_word, stemmed_word).
    The original_word is kept exactly as it appears in the file, so it
    may include punctuation characters; the stemmed_word is computed
    after trimming the punctuation.
    '''
    stem = PorterStemmer().stem
    with open(filename, 'r', encoding='utf-8') as text_file:
        for line in text_file:
            yield from ((token, stem(trim_punctuation(token))) for token in line.split())
 def get_fragments(filename):
    '''
    Returns a generator of Fragment instances, one for each word in the
    text file at the given path. The document name is the file's name
    without its directory and extension, and the words are numbered
    starting from 1. Once all the words were consumed, the total number
    of words is printed.
    '''
    from os import path
    document = path.splitext(path.basename(filename))[0]
    # Stays 0 when the file contains no words at all
    count = 0
    for count, (word, stem) in enumerate(parse_file(filename), start=1):
        yield Fragment(document=document, idx=count, word=word, stem=stem)
    print('{} - {} words'.format(filename, count))
 if __name__ == '__main__':
    # Load NLTK data if necessary
    for resource in ('punkt', 'wordnet'):
        nltk.download(resource)
    # Initialize database
    db = Database('default')
    db.create_table(Fragment)
    # Load the files given on the command line, or everything under ebooks/
    # when no files were given
    for filename in sys.argv[1:] or glob('ebooks/*.txt'):
        db.insert(get_fragments(filename), batch_size=100000)
--- a/examples/full_text_search/models.py
+++ b/examples/full_text_search/models.py
@ -0,0 +1,16 @@
 from infi.clickhouse_orm import *
 class Fragment(Model):
    '''
    A single word of an indexed text.
    '''
    # Language of the text, 'EN' unless specified otherwise
    language = LowCardinalityField(StringField(), default='EN')
    # Name of the document that the word belongs to
    document = LowCardinalityField(StringField())
    # Running number of the word inside the document
    idx      = UInt64Field()
    # The word as it appears in the text
    word     = StringField()
    # The stemmed form of the word
    stem     = StringField()
    # An index for faster search by document and fragment idx
    index    = Index((document, idx), type=Index.minmax(), granularity=1)
    # The primary key allows efficient lookup of stems
    engine   = MergeTree(order_by=(stem, document, idx), partition_key=('language',))
--- a/examples/full_text_search/requirements.txt
+++ b/examples/full_text_search/requirements.txt
@ -0,0 +1,4 @@
 infi.clickhouse_orm
 nltk
 requests
 colorama
--- a/examples/full_text_search/search.py
+++ b/examples/full_text_search/search.py
@ -0,0 +1,90 @@
 import sys
 from colorama import init, Fore, Back, Style
 from nltk.stem.porter import PorterStemmer
 from infi.clickhouse_orm import Database, F
 from models import Fragment
 from load import trim_punctuation
 # The wildcard character. A search term that consists of this character only
 # is not stemmed, and no subquery is created for it when building the query.
 WILDCARD = '*'
 def prepare_search_terms(text):
    '''
    Splits the text to search into words, and returns the list of their
    stems. Punctuation is trimmed from each word before stemming it.
    Wildcards are kept in the list as they are.
    '''
    stem = PorterStemmer().stem
    return [
        term if term == WILDCARD else stem(trim_punctuation(term))
        for term in text.split()
    ]
 def build_query(db, stems):
    '''
    Returns a queryset instance for finding sequences of Fragment instances
    that match the list of stemmed words. Each result identifies the first
    word of a matching sequence by its document and idx, and the results
    are sorted by these fields.
    Wildcards may appear anywhere in the list, including at its beginning.
    Raises ValueError if the list of stems is empty.
    '''
    if not stems:
        raise ValueError('At least one search term is required')
    all_fragments = Fragment.objects_in(db)
    # Start with the candidates for the first position in the sequence.
    # A leading wildcard stands for any word, so in this case the candidates
    # must not be filtered by stem (no stem ever equals the wildcard itself).
    if stems[0] == WILDCARD:
        query = all_fragments.only(Fragment.document, Fragment.idx)
    else:
        query = all_fragments.filter(stem=stems[0]).only(Fragment.document, Fragment.idx)
    # Add the following words to the queryset
    for i, stem in enumerate(stems[1:], start=1):
        # Wildcards need no subquery, they only take up a place in the sequence
        if stem == WILDCARD:
            continue
        # Create a subquery that finds instances of the i'th word
        subquery = all_fragments.filter(stem=stem).only(Fragment.document, Fragment.idx)
        # Add it to the query, requiring that it will appear i places away from the first word
        query = query.filter(F.isIn((Fragment.document, Fragment.idx + i), subquery))
    # Sort the results
    query = query.order_by(Fragment.document, Fragment.idx)
    return query
 def get_matching_text(db, document, from_idx, to_idx, extra=5):
    '''
    Rebuilds a piece of the document's text: the words between the given
    indexes (inclusive), surrounded by up to `extra` words of context on
    each side. Green coloring starts at the word whose index is from_idx
    and is reset after the word whose index is to_idx.
    '''
    first, last = from_idx - extra, to_idx + extra
    in_window = (Fragment.document == document) & (Fragment.idx >= first) & (Fragment.idx <= last)
    words = []
    for fragment in Fragment.objects_in(db).filter(in_window).order_by('document', 'idx'):
        # Color codes are attached to the words at the edges of the match
        prefix = Fore.GREEN if fragment.idx == from_idx else ''
        suffix = Style.RESET_ALL if fragment.idx == to_idx else ''
        words.append(prefix + fragment.word + suffix)
    return ' '.join(words)
 def find(db, text):
    '''
    Searches the database for the given text. Prints the query that was
    built for the search, followed by one line per match that shows the
    document name and the matching text with some context around it.
    '''
    stems = prepare_search_terms(text)
    query = build_query(db, stems)
    print('\n' + Fore.MAGENTA + str(query) + Style.RESET_ALL + '\n')
    # Distance between the first and the last word of each match
    span = len(stems) - 1
    for match in query:
        snippet = get_matching_text(db, match.document, match.idx, match.idx + span)
        print(Fore.CYAN + match.document + ':' + Style.RESET_ALL, snippet)
 if __name__ == '__main__':
    # Set up colorama before anything colored is printed
    init()
    # Connect to the database that holds the indexed texts
    db = Database('default')
    # All the command line arguments together form the text to search;
    # without any arguments there is nothing to do
    search_text = ' '.join(sys.argv[1:])
    if search_text:
        find(db, search_text)