diff --git a/README.md b/README.md index 8e300bd..462c8f4 100644 --- a/README.md +++ b/README.md @@ -52,4 +52,6 @@ for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percen print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row)) ``` +This and other examples can be found in the `examples` folder. + To learn more please visit the [documentation](docs/toc.md). diff --git a/examples/cpu_usage/.gitignore b/examples/cpu_usage/.gitignore new file mode 100644 index 0000000..47def24 --- /dev/null +++ b/examples/cpu_usage/.gitignore @@ -0,0 +1 @@ +/env/ diff --git a/examples/cpu_usage/README.md b/examples/cpu_usage/README.md new file mode 100644 index 0000000..c72ef52 --- /dev/null +++ b/examples/cpu_usage/README.md @@ -0,0 +1,22 @@ +# CPU Usage + +This basic example uses `psutil` to collect a simple time-series of per-CPU usage percent. It then prints out some aggregate statistics based on the collected data. + +## Running the code + +Create a virtualenv and install the required libraries: +``` +virtualenv -p python3.6 env +source env/bin/activate +pip install -r requirements.txt +``` + +Run the `collect` script to populate the database with the CPU statistics. Let it run for a bit before pressing CTRL+C. 
# Prints aggregate statistics about the CPU usage samples that were stored
# in the `demo` database by the `collect` script.

db = Database('demo')
queryset = CPUStats.objects_in(db)

# Percentage of the samples in which CPU 1 was (almost) fully utilized
total = queryset.filter(CPUStats.cpu_id == 1).count()
busy = queryset.filter(CPUStats.cpu_id == 1, CPUStats.cpu_percent > 95).count()
if total:
    print('CPU 1 was busy {:.2f}% of the time'.format(busy * 100.0 / total))
else:
    # No rows for CPU 1: nothing was collected yet, or the machine has a
    # single CPU (ids start at 0). Avoid dividing by zero.
    print('No samples were found for CPU 1')

# Calculate the average usage per CPU
for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percent)):
    print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row))
MergeTree(order_by=(stem, document, idx), partition_key=('language',)) +``` +The `document` (name) and `idx` (running number of the word inside the document) fields identify the specific word. The `word` field stores the original word as it appears in the text, while the `stem` contains the word after normalization, and that's the field which is used for matching the search terms. Stemming the words makes the matching less strict, so that searching for "swallowed" will also find documents that mention "swallow" or "swallowing". + +Here's what some records in the fragment table might look like: + +| language | document | idx | word | stem | +|----------|-------------------------|------|------------------|---------------| +| EN | Moby Dick; or The Whale | 4510 | whenever | whenev | +| EN | Moby Dick; or The Whale | 4511 | it | it | +| EN | Moby Dick; or The Whale | 4512 | is | is | +| EN | Moby Dick; or The Whale | 4513 | a | a | +| EN | Moby Dick; or The Whale | 4514 | damp, | damp | +| EN | Moby Dick; or The Whale | 4515 | drizzly | drizzli | +| EN | Moby Dick; or The Whale | 4516 | November | novemb | +| EN | Moby Dick; or The Whale | 4517 | in | in | +| EN | Moby Dick; or The Whale | 4518 | my | my | +| EN | Moby Dick; or The Whale | 4519 | soul; | soul | + +Let's say we're looking for the terms "drizzly November". Finding the first in the sequence (after stemming it) is fast and easy: +```python +query = Fragment.objects_in(db).filter(stem='drizzli').only(Fragment.document, Fragment.idx) +``` +We're interested only in the `document` and `idx` fields, since they identify a specific word. 
def _safe_filename(title):
    '''
    Returns the title with path separators, characters that are reserved
    on Windows and control characters replaced by underscores.
    '''
    return ''.join('_' if c in '\\/:*?"<>|' or ord(c) < 32 else c for c in title).strip()


def download_ebook(id):
    '''
    Downloads the plain text ebook with the given Project Gutenberg id
    and saves it as `ebooks/<title>.txt`. The title is taken from the
    first line of the text that starts with "Title:".
    An ebook that does not exist (HTTP 404) is skipped. Any other HTTP
    error raises `requests.HTTPError`.
    '''
    print(id, end=' ')
    # Download the ebook's text (the timeout prevents a stalled connection
    # from blocking the script forever)
    r = requests.get('https://www.gutenberg.org/files/{id}/{id}-0.txt'.format(id=id), timeout=60)
    if r.status_code == 404:
        print('NOT FOUND, SKIPPING')
        return
    r.raise_for_status()
    # Find the ebook's title. Only the first "Title:" line is used, so that
    # a line inside the book itself cannot override the header.
    text = r.content.decode('utf-8')
    title = None
    for line in text.splitlines():
        if line.startswith('Title:'):
            title = line[6:].strip()
            break
    if not title:
        # No usable title in the text - fall back to a name based on the id
        title = 'ebook-{}'.format(id)
    print(title)
    # Save the ebook, making sure the title is usable as a file name
    filename = os.path.join('ebooks', '{}.txt'.format(_safe_filename(title) or id))
    with open(filename, 'wb') as f:
        f.write(r.content)


if __name__ == "__main__":
    os.makedirs('ebooks', exist_ok=True)
    for i in [1342, 11, 84, 2701, 25525, 1661, 98, 74, 43, 215, 1400, 76]:
        download_ebook(i)
def trim_punctuation(word):
    '''
    Strips every non-alphanumeric character that comes before the first
    alphanumeric character of the word, or after the last one. Characters
    in between are left untouched. A word without any alphanumeric
    character results in an empty string.
    '''
    alnum_positions = [pos for pos, char in enumerate(word) if char.isalnum()]
    if not alnum_positions:
        return word[len(word):]
    first, last = alnum_positions[0], alnum_positions[-1]
    return word[first:last + 1]


def parse_file(filename):
    '''
    Reads the UTF-8 text file at the given path, word by word.
    Yields a tuple (original_word, stemmed_word) for each whitespace
    separated word. The original word keeps its punctuation characters,
    while the stem is computed after trimming them.
    '''
    to_stem = PorterStemmer().stem
    with open(filename, 'r', encoding='utf-8') as text_file:
        for text_line in text_file:
            yield from (
                (token, to_stem(trim_punctuation(token)))
                for token in text_line.split()
            )


def get_fragments(filename):
    '''
    Yields a Fragment instance for each word of the text file at the
    given path. The document name is the file name without its extension,
    and the words are numbered starting from 1. Once the file is exhausted
    the number of words is printed.
    '''
    from os import path
    document, _extension = path.splitext(path.basename(filename))
    count = 0
    for count, (word, stem) in enumerate(parse_file(filename), start=1):
        yield Fragment(document=document, idx=count, word=word, stem=stem)
    print('{} - {} words'.format(filename, count))


if __name__ == '__main__':

    # Make sure the NLTK data packages are available
    for package in ('punkt', 'wordnet'):
        nltk.download(package)

    # Connect to the database and create the table if needed
    db = Database('default')
    db.create_table(Fragment)

    # Files given on the command line take precedence over ebooks/
    filenames = sys.argv[1:]
    if not filenames:
        filenames = glob('ebooks/*.txt')
    for filename in filenames:
        db.insert(get_fragments(filename), batch_size=100000)
# The wildcard character
WILDCARD = '*'


def prepare_search_terms(text):
    '''
    Convert the text to search into a list of stemmed words.
    Wildcards are kept as they are. Any other word is stripped of its
    surrounding punctuation and then stemmed.
    '''
    stemmer = PorterStemmer()
    return [
        word if word == WILDCARD else stemmer.stem(trim_punctuation(word))
        for word in text.split()
    ]


def _first_term_index(stems):
    '''
    Returns the position of the first stem that is not a wildcard,
    or None when there is no such stem (including an empty list).
    '''
    return next((i for i, stem in enumerate(stems) if stem != WILDCARD), None)


def build_query(db, stems):
    '''
    Returns a queryset instance for finding sequences of Fragment instances
    that match the list of stemmed words.
    Each result identifies (by `document` and `idx`) the word that matches
    the first stem which is not a wildcard. When the list starts with
    wildcards, the sequence therefore begins that many words earlier.
    Raises ValueError if the list contains no stem other than wildcards.
    '''
    first = _first_term_index(stems)
    if first is None:
        raise ValueError('At least one search term that is not a wildcard is required')
    # Start by searching for the first stemmed word
    all_fragments = Fragment.objects_in(db)
    query = all_fragments.filter(stem=stems[first]).only(Fragment.document, Fragment.idx)
    if first > 0:
        # Leading wildcards stand for words that must exist before the first
        # term (the word indexes written by the load script start at 1)
        query = query.filter(Fragment.idx > first)
    # Add the following words to the queryset
    for i, stem in enumerate(stems):
        # Skip everything up to the first term (it's already in the query), and wildcards
        if i <= first or stem == WILDCARD:
            continue
        # Create a subquery that finds instances of the i'th word
        subquery = all_fragments.filter(stem=stem).only(Fragment.document, Fragment.idx)
        # Add it to the query, requiring that it will appear at the right distance from the first term
        query = query.filter(F.isIn((Fragment.document, Fragment.idx + (i - first)), subquery))
    # Sort the results
    return query.order_by(Fragment.document, Fragment.idx)


def get_matching_text(db, document, from_idx, to_idx, extra=5):
    '''
    Reconstructs the document text between the given indexes (inclusive),
    plus `extra` words before and after the match. The words that are
    included in the given range are highlighted in green.
    '''
    text = []
    highlighting = False
    conds = (Fragment.document == document) & (Fragment.idx >= from_idx - extra) & (Fragment.idx <= to_idx + extra)
    for fragment in Fragment.objects_in(db).filter(conds).order_by('document', 'idx'):
        word = fragment.word
        if fragment.idx == from_idx:
            word = Fore.GREEN + word
            highlighting = True
        if fragment.idx == to_idx:
            word = word + Style.RESET_ALL
            highlighting = False
        text.append(word)
    result = ' '.join(text)
    if highlighting:
        # The range ends after the last word of the document, so the color
        # was never reset - do it now to avoid coloring the following output
        result += Style.RESET_ALL
    return result


def find(db, text):
    '''
    Performs the search for the given text, and prints out the matches.
    Nothing is searched if the text contains no word other than wildcards.
    '''
    stems = prepare_search_terms(text)
    first = _first_term_index(stems)
    if first is None:
        print('Nothing to search for - please provide at least one word that is not a wildcard')
        return
    query = build_query(db, stems)
    print('\n' + Fore.MAGENTA + str(query) + Style.RESET_ALL + '\n')
    for match in query:
        # The match points at the first non-wildcard term, so step back over the leading wildcards
        from_idx = match.idx - first
        to_idx = from_idx + len(stems) - 1
        snippet = get_matching_text(db, match.document, from_idx, to_idx)
        print(Fore.CYAN + match.document + ':' + Style.RESET_ALL, snippet)


if __name__ == '__main__':

    # Initialize colored output
    init()

    # Initialize database
    db = Database('default')

    # Search
    text = ' '.join(sys.argv[1:])
    if text:
        find(db, text)