import sys
import nltk
from nltk.stem.porter import PorterStemmer
from glob import glob
from infi.clickhouse_orm import Database
from models import Fragment
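
# `Fragment` is defined in models.py (not shown here). Judging from the keyword
# arguments passed to it in get_fragments() below, it is assumed to look roughly
# like this sketch; the actual field types and table engine live in models.py:
#
#   class Fragment(Model):
#       document = StringField()
#       idx = UInt64Field()
#       word = StringField()
#       stem = StringField()
#       engine = ...  # some MergeTree-family engine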


def trim_punctuation(word):
    '''
    Trim punctuation characters from the beginning and end of the word
    '''
    # Scan for the first and last alphanumeric characters; anything outside
    # that range is treated as punctuation and sliced off.
    start = end = len(word)
    for i in range(len(word)):
        if word[i].isalnum():
            start = min(start, i)
            end = i + 1
    return word[start : end]
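
# A quick illustration (hypothetical inputs):
#   trim_punctuation('"Hello,"')  ->  'Hello'
#   trim_punctuation('---')       ->  ''   (no alphanumeric characters at all)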


def parse_file(filename):
    '''
    Parses a text file at the given path.
    Returns a generator of tuples (original_word, stemmed_word)
    The original_word may include punctuation characters.
    '''
    stemmer = PorterStemmer()
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            for word in line.split():
                yield (word, stemmer.stem(trim_punctuation(word)))
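
# For a line such as "The pictures were amazing," this yields pairs like
# ('The', 'the'), ('pictures', 'pictur'), ('amazing,', 'amaz'); hypothetical
# input, and the exact stems depend on NLTK's Porter implementation.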


def get_fragments(filename):
    '''
    Converts a text file at the given path to a generator
    of Fragment instances.
    '''
    from os import path
    # Use the file name (without directory and extension) as the document id
    document = path.splitext(path.basename(filename))[0]
    idx = 0
    for word, stem in parse_file(filename):
        idx += 1
        yield Fragment(document=document, idx=idx, word=word, stem=stem)
    # Reached only once the generator has been fully consumed
    print('{} - {} words'.format(filename, idx))


if __name__ == '__main__':

    # Load NLTK data if necessary
    nltk.download('punkt')
    nltk.download('wordnet')

    # Initialize database
    db = Database('default')
    db.create_table(Fragment)

    # Load files from the command line or everything under ebooks/
    filenames = sys.argv[1:] or glob('ebooks/*.txt')
    for filename in filenames:
        db.insert(get_fragments(filename), batch_size=100000)
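
# Once loaded, the fragments can be queried back through the ORM's queryset
# API, for example (a sketch; 'dorian' is just a hypothetical stem value):
#
#   for fragment in Fragment.objects_in(db).filter(stem='dorian'):
#       print(fragment.document, fragment.idx, fragment.word)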