infi.clickhouse_orm/examples/full_text_search/search.py

import sys
from colorama import init, Fore, Back, Style
from nltk.stem.porter import PorterStemmer
from infi.clickhouse_orm import Database, F
from models import Fragment
from load import trim_punctuation


# The wildcard character
WILDCARD = '*'


def prepare_search_terms(text):
    '''
    Convert the text to search into a list of stemmed words.
    '''
    stemmer = PorterStemmer()
    stems = []
    for word in text.split():
        if word == WILDCARD:
            stems.append(WILDCARD)
        else:
            stems.append(stemmer.stem(trim_punctuation(word)))
    return stems


def build_query(db, stems):
    '''
    Returns a queryset instance for finding sequences of Fragment instances
    that matche the list of stemmed words.
    '''
    # Start by searching for the first stemmed word
    all_fragments = Fragment.objects_in(db)
    query = all_fragments.filter(stem=stems[0]).only(Fragment.document, Fragment.idx)
    # Add the following words to the queryset
    for i, stem in enumerate(stems):
        # Skip the first word (it's already in the query), and wildcards
        if i == 0 or stem == WILDCARD:
            continue
        # Create a subquery that finds instances of the i'th word
        subquery = all_fragments.filter(stem=stem).only(Fragment.document, Fragment.idx)
        # Add it to the query, requiring that it will appear i places away from the first word
        query = query.filter(F.isIn((Fragment.document, Fragment.idx + i), subquery))
    # Sort the results
    query = query.order_by(Fragment.document, Fragment.idx)
    return query


def get_matching_text(db, document, from_idx, to_idx, extra=5):
    '''
    Reconstructs the document text between the given indexes (inclusive),
    plus `extra` words before and after the match. The words that are
    included in the given range are highlighted in green.
    '''
    text = []
    conds = (Fragment.document == document) & (Fragment.idx >= from_idx - extra) & (Fragment.idx <= to_idx + extra)
    for fragment in Fragment.objects_in(db).filter(conds).order_by('document', 'idx'):
        word = fragment.word
        if fragment.idx == from_idx:
            word = Fore.GREEN + word
        if fragment.idx == to_idx:
            word = word + Style.RESET_ALL
        text.append(word)
    return ' '.join(text)


def find(db, text):
    '''
    Performs the search for the given text, and prints out the matches.
    '''
    stems = prepare_search_terms(text)
    query = build_query(db, stems)
    print('\n' + Fore.MAGENTA + str(query) + Style.RESET_ALL + '\n')
    for match in query:
        text = get_matching_text(db, match.document, match.idx, match.idx + len(stems) - 1)
        print(Fore.CYAN + match.document + ':' + Style.RESET_ALL, text)


if __name__ == '__main__':

    # Initialize colored output
    init()

    # Initialize database
    db = Database('default')

    # Search
    text = ' '.join(sys.argv[1:])
    if text:
        find(db, text)